爬取三国演义的章节和内容

import requests
from bs4 import BeautifulSoup
# Scrape every chapter of "Romance of the Three Kingdoms" from shicimingju.com
# and write "<title>:<content>" for each chapter to ./sanguoyanyi3.txt.

# Present a desktop-browser User-Agent on every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4315.5 Safari/537.36'
}

# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script indefinitely.
REQUEST_TIMEOUT = 10

# Table-of-contents page listing one link per chapter.
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
response = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()  # fail loudly instead of parsing an error page
response.encoding = 'utf-8'  # force UTF-8 so the Chinese text is not garbled
page_text = response.text

soup = BeautifulSoup(page_text, 'lxml')
li_list = soup.select('.book-mulu > ul >li')

# The context manager guarantees the file is flushed and closed even when a
# request or parse step raises part-way through the download.
with open('./sanguoyanyi3.txt', 'w', encoding='utf-8') as fp:
    for li in li_list:
        link = li.a
        if link is None or not link.get('href'):
            # A list item without a usable link cannot point to a chapter.
            continue

        # get_text() always returns a str, whereas .string is None when the
        # anchor contains nested tags (which would break the concatenation).
        title = link.get_text()
        detail_url = 'https://www.shicimingju.com' + link['href']

        response_detail = requests.get(
            url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT
        )
        response_detail.raise_for_status()
        response_detail.encoding = 'utf-8'
        detail_text = response_detail.text

        detail_soup = BeautifulSoup(detail_text, 'lxml')
        content_div = detail_soup.find('div', class_='chapter_content')
        if content_div is None:
            # Surface a clear error rather than an AttributeError on None.
            raise ValueError(
                'no chapter_content div found at ' + detail_url
            )
        content = content_div.text

        fp.write(title + ':' + content + '\n')
        print(title, '下载完毕!!')
posted @ 2021-02-17 01:55  未来全栈攻城狮  阅读(132)  评论(0)  编辑  收藏  举报