[Web Scraping] Project: Douban Reading Top250 (https://book.douban.com/top250)
Scrape the Douban Reading Top250 list (https://book.douban.com/top250), collecting
each book's title, author, publisher, publish date, price, rating, and related fields,
and save the results to both a CSV file and an Excel file.
import csv
import random
import time

import cchardet
import requests
import xlsxwriter
from fake_useragent import UserAgent  # optional source of randomized User-Agent strings
from lxml import html

# Scrape Douban Reading Top250 (https://book.douban.com/top250):
# title, author, publisher, publish date, price, rating, etc. for each book,
# saving the results to both a CSV file and an Excel file.
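# Third-party dependencies (PyPI install names assumed; versions not pinned here):
#   pip install requests lxml xlsxwriter cchardet fake-useragent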
# Fetch a page and return its parsed DOM tree
def get_url(url):
    # Sample one proxy per request; requests consults the 'https' key for https:// URLs
    proxy = random.choice(proxy_pool)
    source = requests.get(url, headers=headers,
                          proxies={'http': proxy, 'https': proxy})
    # Detect the charset from the raw bytes before decoding
    source.encoding = cchardet.detect(source.content)['encoding']
    return html.fromstring(source.text)
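# A drop-in alternative (a sketch, not part of the original script) for cases
# where cchardet is hard to install: requests can guess the charset itself via
# response.apparent_encoding, backed by chardet/charset_normalizer.
def get_url_fallback(url):
    proxy = random.choice(proxy_pool)
    source = requests.get(url, headers=headers,
                          proxies={'http': proxy, 'https': proxy})
    source.encoding = source.apparent_encoding
    return html.fromstring(source.text)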
# Parse one list page and save each book's fields
def parse_url(source):
    global x
    # Each of the 25 books on a page sits in its own <table>
    for item in source.xpath('//table/tr/td[@valign="top"][2]'):
        x = x + 1
        # Link
        href = ''.join(item.xpath('div/a/@href'))
        # Title
        name = ''.join(item.xpath('div/a/@title'))
        # Publish-info line: "author [/ translator] / publisher / date / price"
        info_list = item.xpath('p[@class="pl"]/text()')[0].split(' / ')
        # Author
        writer = info_list[0]
        # Translator (only present when the line has more than four fields)
        if len(info_list) <= 4:
            translator = ""
        else:
            translator = info_list[-4]
        # Publisher
        press = info_list[-3]
        # Publish date
        publish_time = info_list[-2]
        # Price
        price = info_list[-1]
        # Rating
        score = ''.join(item.xpath('div/span[2]/text()'))
        # One-line quote; ''.join of an empty result gives "" when a book has none
        quote = ''.join(item.xpath('p[@class="quote"]/span/text()'))
        print(x, href, name, writer, translator, press, publish_time, price, score, quote)
        datalist = [href, name, writer, translator, press, publish_time, price, score, quote]
        save_csv(datalist)
        save_excel(datalist)
    print("Page saved")
# Append one row to the xlsx worksheet (x and worksheet are module-level globals)
def save_excel(datalist):
    for j in range(len(datalist)):
        worksheet.write(x, j, datalist[j])
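# xlsxwriter also provides write_row, so the loop above could be the one-liner
# below (an equivalent alternative, not what this script uses):
#   worksheet.write_row(x, 0, datalist)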
# Append one row to the CSV file
def save_csv(datalist):
    with open('DouBanReadingTop250.csv', 'a+', newline='', encoding='utf-8-sig') as file:
        csv.writer(file).writerow(datalist)
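# Note: utf-8-sig writes a BOM so Excel detects the encoding of the CJK fields.
# Reopening the file for every row is simple but slow; opening it once and
# reusing the writer would also work (a sketch, not used here):
#   csv_file = open('DouBanReadingTop250.csv', 'a', newline='', encoding='utf-8-sig')
#   csv_writer = csv.writer(csv_file)  # then call csv_writer.writerow(datalist)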
if __name__ == '__main__':
    base_url = "https://book.douban.com/top250?start="
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30',
        # alternative: 'user-agent': UserAgent().ie,
        # Session cookie copied from a logged-in browser; replace it with your own
        'cookie': 'bid=TLTkVgoSwQk; douban-fav-remind=1; __gads=ID=b627338abfa00e2e-22b5bdc80bcb000c:T=1629370026:RT=1629370026:S=ALNI_MYVZYapcFsvxEZuTcIsxjptz_2osQ; ll="118200"; dbcl2="144119796:5CQj234vSRY"; push_doumail_num=0; push_noty_num=0; __utmv=30149280.14411; __utmz=30149280.1634226952.14.12.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ck=JmUh; _pk_ses.100001.3ac3=*; __utma=30149280.1284446217.1629370028.1634226952.1635775501.15; __utmc=30149280; __utmt_douban=1; __utma=81379588.517020736.1635775501.1635775501.1635775501.1; __utmc=81379588; __utmz=81379588.1635775501.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; gr_user_id=99ce56ab-fb6b-4016-8a0a-356e0d50447e; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=d60a4447-6384-4429-972b-3b07737ce1cd; gr_cs1_d60a4447-6384-4429-972b-3b07737ce1cd=user_id:1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_d60a4447-6384-4429-972b-3b07737ce1cd=true; _vwo_uuid_v2=D4578049133BF9C1DE3C7C04C01F84037|46386fb4041c76b508504389f30a80f0; _pk_id.100001.3ac3=787e285429a5299f.1635775501.1.1635775829.1635775501.; __utmb=30149280.14.10.1635775501; __utmb=81379588.14.10.1635775501'
    }
    # Proxy pool; get_url() samples one address per request. A list is used so
    # every address survives (duplicate dict keys would collapse to one entry).
    # These free proxies go stale quickly, so expect to refresh them.
    proxy_pool = [
        '211.65.197.93:80', '60.161.148.9:8118', '60.174.197.23:7890',
        '1.71.143.122:6969', '117.88.71.66:3000', '61.216.185.88:60808',
        '222.78.6.70:8083', '180.122.147.85:3000', '139.227.144.203:7890',
        '117.88.71.173:3000',
    ]
    # Column headers
    colnames = ['Link', 'Title', 'Author', 'Translator', 'Publisher',
                'Publish Date', 'Price', 'Rating', 'Quote']
    # Create the CSV file with a header row
    with open('DouBanReadingTop250.csv', 'w', newline='', encoding='utf-8-sig') as file:
        csv.writer(file).writerow(colnames)
    # Create the xlsx file and write the header row
    workbook = xlsxwriter.Workbook('DouBanReadingTop250.xlsx')
    worksheet = workbook.add_worksheet()
    for i in range(len(colnames)):
        worksheet.write(0, i, colnames[i])
    # Row counter shared with parse_url() and save_excel(); row 0 is the header
    x = 0
    # Crawl the 10 list pages (25 books per page)
    for i in range(10):
        url = base_url + str(i * 25)
        print(url)
        source = get_url(url)
        parse_url(source)
        time.sleep(random.randint(1, 3))
    # xlsxwriter only writes the file to disk on close
    workbook.close()
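To spot-check the output, the CSV can be read back with the standard library
(a usage sketch, assuming the scrape above completed and produced the English
column headers):

import csv

with open('DouBanReadingTop250.csv', encoding='utf-8-sig') as f:
    for row in csv.DictReader(f):
        print(row['Title'], row['Rating'])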
