# -*- coding: utf-8 -*-
import requests
import xlwt
from bs4 import BeautifulSoup
from collections import OrderedDict
class DouBanBookSpider(object):
    """Scrape the book listing of one Douban tag and export it to Excel.

    Typical use is ``do_spider()`` followed by ``write_excel()``.

    Attributes:
        book_type: Tag name inserted into the listing URL; also used as the
            sheet name and the output file's base name.
        quantity: Number of books requested. Whole pages are fetched, so the
            number collected is rounded up to a multiple of ``PAGE_SIZE``.
        url_list: Listing-page URLs produced by the last ``get_url()`` call.
        book_dict: Collected books keyed by their 1-based sequence number
            (as a string), in the order they were scraped.
        count: Number of books collected so far.
    """

    # Each listing page holds 20 books, so ``start`` advances in steps of 20.
    PAGE_SIZE = 20
    # Seconds to wait for a response; without a timeout a stalled connection
    # would block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, book_type, quantity):
        self.book_type = book_type
        self.quantity = quantity
        self.url_list = []
        self.book_dict = OrderedDict()
        self.count = 0

    def get_url(self):
        """Build the listing-page URLs needed to cover ``quantity`` books.

        The list is rebuilt on every call, so calling this repeatedly does
        not accumulate duplicate URLs.

        Returns:
            The list of page URLs (also stored in ``self.url_list``); empty
            when ``quantity`` is not positive.
        """
        # range(0, quantity, PAGE_SIZE) yields exactly ceil(quantity / 20)
        # offsets; the previous ``count < quantity + 1`` loop fetched one
        # page too many whenever quantity was a multiple of the page size.
        self.url_list = [
            'https://book.douban.com/tag/%s?start=%d&type=S' % (self.book_type, start)
            for start in range(0, self.quantity, self.PAGE_SIZE)
        ]
        return self.url_list

    @staticmethod
    def _clean_text(tag):
        """Return the stripped text of ``tag``, or None when ``tag`` is None."""
        # get_text() also works for tags with nested children, where
        # ``tag.string`` would be None and ``.strip()`` would raise.
        return tag.get_text(strip=True) if tag is not None else None

    def main_spider(self, url):
        """Fetch one listing page and record every book found on it.

        Each ``li.subject-item`` element adds one entry to ``self.book_dict``
        and increments ``self.count``. Items without a title link are
        skipped; a missing rating, comment count or publication line is
        stored as None. A page with no items leaves the state untouched.

        Args:
            url: Listing-page URL, normally one produced by ``get_url()``.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        rsp = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page = BeautifulSoup(rsp.text, 'lxml')
        for item in page.find_all('li', class_='subject-item'):
            # The Tag objects are searched directly; re-parsing str(item)
            # into a new BeautifulSoup document is unnecessary.
            title_link = item.h2.a if item.h2 is not None else None
            if title_link is None:
                continue
            book_name = title_link.get_text(strip=True)
            author = self._clean_text(item.find('div', class_='pub'))
            grade = None
            comment_count = None
            comment_info = item.find('div', class_='star clearfix')
            if comment_info is not None:
                grade = self._clean_text(comment_info.find('span', class_='rating_nums'))
                comment_count = self._clean_text(comment_info.find('span', class_='pl'))
            # Count only after a successful parse so sequence numbers (and
            # therefore spreadsheet rows) stay contiguous.
            self.count += 1
            self.book_dict[str(self.count)] = {
                '序号': self.count,
                '书名': book_name,
                '评分': grade,
                '评论数': comment_count,
                '作者': author,
            }

    def do_spider(self):
        """Crawl every URL returned by ``get_url()`` in order."""
        for page_url in self.get_url():
            self.main_spider(page_url)

    def write_excel(self):
        """Write the collected books to ``<book_type>.xls``.

        Row 0 is a bold header; each book is written to the row given by
        its sequence number.
        """
        wb = xlwt.Workbook(encoding='utf-8')
        ws = wb.add_sheet(self.book_type)
        header_style = xlwt.XFStyle()
        font = xlwt.Font()
        font.name = 'Times New Roman'
        font.bold = True
        header_style.font = font
        # (column header, key in the book record). Columns are looked up by
        # key so the layout does not depend on dict value ordering.
        columns = [
            ('序号', '序号'),
            ('书名', '书名'),
            ('评分', '评分'),
            ('评论数', '评论数'),
            ('出版信息', '作者'),
        ]
        for col, (title, _) in enumerate(columns):
            ws.write(0, col, title, header_style)
        for key, book in self.book_dict.items():
            row = int(key)
            for col, (_, field) in enumerate(columns):
                ws.write(row, col, book.get(field))
        # xlwt produces the legacy binary .xls format; saving it under an
        # .xlsx name makes Excel reject the file as a format mismatch.
        wb.save('%s.xls' % self.book_type)
if __name__ == "__main__":
    # Script entry point: crawl the "中国历史" tag for 2000 books, then
    # export whatever was collected to a spreadsheet.
    spider = DouBanBookSpider(book_type='中国历史', quantity=2000)
    spider.do_spider()
    spider.write_excel()