
[Web Scraping] Project: Douban Books Top 250 (https://book.douban.com/top250)

Scrape the Douban Books Top 250 (https://book.douban.com/top250): for each book, collect the
title, author, publisher, publication date, price, rating, and other fields, and save the
results to both a CSV file and an Excel file.

import xlsxwriter
import requests
from fake_useragent import UserAgent
import cchardet
import time
import random
import csv
from lxml import html
# Scrape the Douban Books Top 250 (https://book.douban.com/top250): title, author,
# publisher, publication date, price, rating, etc. for each book, and save the
# results to both a CSV file and an Excel file.

# Fetch a page and return it as an lxml HTML tree
def get_url(url):
    source = requests.get(url, headers=headers, proxies=proxies)
    # Detect the response encoding with cchardet before decoding
    source.encoding = cchardet.detect(source.content)['encoding']
    source = html.fromstring(source.text)
    return source
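
# A possible hardening of get_url (not in the original): add a timeout and basic
# error handling so a dead proxy fails fast, e.g.
#     source = requests.get(url, headers=headers, proxies=proxies, timeout=10)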

# Parse one list page and save every book on it
def parse_url(source):
    global x
    for i in range(1, 26):
        a = source.xpath('//table[%d]/tr/td[@valign="top"][2]' % i)
        for book in a:
            x = x + 1
            # Link
            href = ''.join(book.xpath('div/a/@href'))
            # Title
            name = ''.join(book.xpath('div/a/@title'))
            info_list = book.xpath('p[@class="pl"]/text()')[0].split(' / ')
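            # The publisher info string looks like
            # "作者 / [译者 /] 出版社 / 出版时间 / 价格"; the translator field is
            # optional, which is why publisher/date/price are indexed from the end below.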
            # Author
            writer = info_list[0]
            # Translator (present only when the entry has five or more fields)
            if len(info_list) <= 4:
                translator = ""
            else:
                translator = info_list[-4]
            # Publisher
            press = info_list[-3]
            # Publication date
            publish_time = info_list[-2]
            # Price
            price = info_list[-1]
            # Rating
            score = ''.join(book.xpath('div/span[2]/text()'))
            # One-line quote/summary (empty string if the book has none)
            summary = ''.join(book.xpath('p[@class="quote"]/span/text()'))
            print(x, href, name, writer, translator, press, publish_time, price, score, summary)
            datalist = [href, name, writer, translator, press, publish_time, price, score, summary]
            save_csv(datalist)
            save_excel(datalist)
    print("保存成功")


# Append one row to the Excel worksheet; x is the global row counter
def save_excel(datalist):
    for j in range(0, 9):
        worksheet.write(x, j, datalist[j])


# Append one row to the CSV file (utf-8-sig adds a BOM so Excel detects the encoding)
def save_csv(datalist):
    with open('DouBanReadingTop250.csv', 'a+', newline='', encoding='utf-8-sig') as file:
        csv.writer(file).writerow(datalist)


if __name__ == '__main__':

    base_url="https://book.douban.com/top250?start="
    headers={
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30',
        #'user-agent': UserAgent().ie,   # alternative: pull a UA string from fake_useragent
        'cookie':'bid=TLTkVgoSwQk; douban-fav-remind=1; __gads=ID=b627338abfa00e2e-22b5bdc80bcb000c:T=1629370026:RT=1629370026:S=ALNI_MYVZYapcFsvxEZuTcIsxjptz_2osQ; ll="118200"; dbcl2="144119796:5CQj234vSRY"; push_doumail_num=0; push_noty_num=0; __utmv=30149280.14411; __utmz=30149280.1634226952.14.12.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ck=JmUh; _pk_ses.100001.3ac3=*; __utma=30149280.1284446217.1629370028.1634226952.1635775501.15; __utmc=30149280; __utmt_douban=1; __utma=81379588.517020736.1635775501.1635775501.1635775501.1; __utmc=81379588; __utmz=81379588.1635775501.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; gr_user_id=99ce56ab-fb6b-4016-8a0a-356e0d50447e; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=d60a4447-6384-4429-972b-3b07737ce1cd; gr_cs1_d60a4447-6384-4429-972b-3b07737ce1cd=user_id:1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_d60a4447-6384-4429-972b-3b07737ce1cd=true; _vwo_uuid_v2=D4578049133BF9C1DE3C7C04C01F84037|46386fb4041c76b508504389f30a80f0; _pk_id.100001.3ac3=787e285429a5299f.1635775501.1.1635775829.1635775501.; __utmb=30149280.14.10.1635775501; __utmb=81379588.14.10.1635775501'
    }
    # Proxy pool: a Python dict cannot hold duplicate keys, so the original literal
    # silently kept only the last 'HTTP' entry. Keep the candidates in a list instead
    # and pick one at random for this run.
    proxy_pool = [
        '211.65.197.93:80', '60.161.148.9:8118', '60.174.197.23:7890',
        '1.71.143.122:6969', '117.88.71.66:3000', '61.216.185.88:60808',
        '222.78.6.70:8083', '180.122.147.85:3000', '139.227.144.203:7890',
        '117.88.71.173:3000'
    ]
    proxies = {'http': 'http://' + random.choice(proxy_pool)}
    # Column headers: link, title, author, translator, publisher, publish date, price, rating, summary
    colname = ('链接', '书名', '作者', '译者', '出版社', '出版时间', '价格', '评分', '摘要')
    # Create the CSV file and write the header row
    with open('DouBanReadingTop250.csv', 'w', newline='', encoding='utf-8-sig') as file:
        csv.writer(file).writerow(colname)
    # Create the xlsx workbook and write the header row
    workbook = xlsxwriter.Workbook('DouBanReadingTop250.xlsx')
    worksheet = workbook.add_worksheet()
    for i in range(0, 9):
        worksheet.write(0, i, colname[i])
    # x counts the rows already written; row 0 holds the header
    x = 0
    # Crawl the ten list pages (25 books per page)
    for i in range(0, 10):
        url = base_url + str(i * 25)
        print(url)
        source = get_url(url)
        parse_url(source)
        # Random pause between pages to avoid hammering the site
        time.sleep(random.randint(1, 3))
    workbook.close()
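
As a quick sanity check after the run (not part of the original script, and assuming pandas is installed), the CSV can be read back to confirm that all 250 rows and 9 columns were written:

import pandas as pd

df = pd.read_csv('DouBanReadingTop250.csv', encoding='utf-8-sig')
print(df.shape)    # expected: (250, 9)
print(df.head())   # first few rows: link, title, author, ...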