爬取三寸人间

#coding=gbk
from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent
from lxml import etree

# Index page of the book; the chapter links are collected from it below.
url = 'https://www.81zw.com/book/32934/'
headers = {
    # A random User-Agent string, picked once per run by fake_useragent.
    'User-Agent': UserAgent().random,
}

# The timeout keeps a stalled connection from hanging the script forever, and
# raise_for_status() stops on an HTTP error instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
e = etree.HTML(response.text)

# Every href found under <div id="list"> is treated as a chapter link.
# urljoin resolves absolute-path, relative and fully-qualified hrefs alike
# against the index URL, rather than blindly dropping the first character.
txt_urls = [urljoin(url, href) for href in e.xpath('//div[@id="list"]//@href')]

# Download every chapter in order and append it to a single text file.
# The file is opened once inside a `with` block so that it is always closed,
# even when a request or a parse step raises part-way through the run.
with open('三寸人间.txt', 'a', encoding='utf-8-sig') as file:
    for num, txt_url in enumerate(txt_urls):
        # Timeout and status check: fail loudly rather than hang or store an
        # error page as if it were chapter text.
        response = requests.get(txt_url, headers=headers, timeout=30)
        response.raise_for_status()
        e = etree.HTML(response.content.decode('utf-8'))

        # The first <h1> text node is used as the chapter title; an IndexError
        # here means the page contained no <h1> text at all.
        txt_title = e.xpath('//h1/text()')[0]
        # Direct text nodes of <div id="content">, one output line each.
        txt_content = e.xpath('//div[@id="content"]/text()')

        file.write(str(txt_title) + '\n')
        file.writelines(line + '\n' for line in txt_content)
        # Flush per chapter so finished chapters reach the OS even if a later
        # chapter fails or the process is interrupted.
        file.flush()

        # NOTE: requests are sent back-to-back with no delay between them.
        print("第 {} 章下载完毕".format(num + 1))

2020-07-15

posted @ 2020-07-15 15:57  CodeYaSuo  阅读(134)  评论(0)  编辑  收藏  举报