-
Notifications
You must be signed in to change notification settings - Fork 2
Open
Description
## Scrape book information from a Douban page by ISBN: title, author, intro, publisher, publish_date.
## Example page: http://douban.com/isbn/9787111637172
## python3 -m pip install beautifulsoup4
## python3 -m pip install lxml
import random
import re
import sys
import time
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
class Douban:
    """Scrape basic book metadata from Douban by ISBN.

    The page at ``http://douban.com/isbn/<isbn>/`` is downloaded and parsed
    with Beautiful Soup (lxml backend).  The individual fields are located
    with fixed CSS child-selector chains and, for the publication date, a
    regular expression applied to the serialized markup.
    """

    def __init__(self, timeout=10):
        """Set up the extraction patterns and the HTTP request headers.

        Args:
            timeout: Seconds to wait for the HTTP request before giving up.
                ``None`` disables the timeout.
        """
        # Retained for compatibility; the publisher lookup below is
        # selector-based and does not currently use this pattern.
        self.__r_publisher = r'出版社:</span>(.*?)<br/>'
        self.__r_publish_date = r'出版年:</span>(.*?)<br/>'
        self.timeout = timeout
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'}

    def get_book(self, isbn=""):
        """Return a dict describing the book with the given ISBN.

        Args:
            isbn: ISBN to look up.

        Returns:
            A dict with the keys ``isbn``, ``title``, ``author``, ``intro``,
            ``publisher`` and ``publish_date``, or ``None`` when the page
            could not be fetched (empty ISBN, network/HTTP error, empty
            body).  Fields that cannot be located on the page are returned
            as empty strings.
        """
        html = self.__get_html(isbn=isbn)
        if not html:  # not found
            return None
        soup = self.__get_soup(html=html)
        return {
            "isbn": isbn,
            "title": self.__getTitle(soup=soup),
            "author": self.__getAuthor(soup=soup),
            "intro": self.__get_intro(soup=soup),
            "publisher": self.__getpublisher(soup=soup),
            "publish_date": self.__getpublish_date(soup=soup),
        }

    def __get_html(self, isbn=""):
        """Download the Douban page for ``isbn`` and return it as text.

        Returns ``None`` when the ISBN is empty or the request fails.
        """
        isbn_text = "" if isbn is None else str(isbn).strip()
        if not isbn_text:
            # Nothing to look up; avoid a pointless request.
            return None
        # Percent-encode so stray spaces/slashes cannot produce an invalid
        # or unintended URL.
        quoted = urllib.parse.quote(isbn_text, safe="")
        url = f"http://douban.com/isbn/{quoted}/"
        request = urllib.request.Request(url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(request, timeout=self.timeout) as response:
                raw = response.read()
        except OSError:
            # URLError/HTTPError and socket timeouts are all OSError
            # subclasses; KeyboardInterrupt/SystemExit are NOT swallowed.
            return None
        return raw.decode('utf-8')

    def __get_soup(self, html=""):
        """Parse already-decoded HTML text into a BeautifulSoup tree."""
        # The markup is a str, so no encoding hints are passed to the parser.
        return BeautifulSoup(html, 'lxml')

    @staticmethod
    def __text_of(nodes):
        """Join the text of the given nodes with ", " and trim whitespace."""
        return ", ".join(node.get_text() for node in nodes).strip()

    def __getTitle(self, soup):
        """Return the book title, or "" when the title node is missing."""
        return self.__text_of(soup.select("body>div>h1>span"))

    def __getAuthor(self, soup):
        """Return the text of the first author link, or "" when absent."""
        anchors = soup.select(
            "body>div>div>div>div>div>div>div>div>span>a")
        if not anchors:
            return ""
        return anchors[0].get_text().strip()

    def __getpublisher(self, soup):
        """Return the text of the second matching link, or "" when absent.

        NOTE(review): relies on the publisher being the second ``<a>`` that
        the selector matches - confirm against current page layout.
        """
        anchors = soup.select(
            "body>div>div>div>div>div>div>div>div>a")
        if len(anchors) < 2:
            return ""
        return anchors[1].get_text().strip()

    def __getpublish_date(self, soup):
        """Return the publication date text, or "" when it is not found."""
        pattern = re.compile(self.__r_publish_date)
        for node in soup.select("body>div>div>div>div>div>div>div>div"):
            # The pattern targets raw markup, so search the serialized node.
            match = pattern.search(str(node))
            if match:
                return match.group(1).strip()
        return ""

    def __get_intro(self, soup):
        """Return the book introduction text, or "" when it is not found."""
        return self.__text_of(soup.select(
            "body>div>div>div>div>div>div>span>div>div"))
if __name__ == "__main__":
    # Demo run: look up one sample ISBN and print the resulting record.
    scraper = Douban()
    record = scraper.get_book("9787111637172")
    print(record)
Metadata
Metadata
Assignees
Labels
No labels