用Python爬取第一次看的小说，童年的回忆 - Adventur

# -*- coding: utf-8 -*-
import requests
import re
import os
# Download every chapter of one novel from yibige.cc and store each chapter as
# a UTF-8 text file inside the output directory created below.

# makedirs(exist_ok=True) creates the directory only when it is missing,
# without the check-then-create race of a separate exists() test.
os.makedirs('青春无悔', exist_ok=True)

url = 'https://www.yibige.cc/110006/index.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4506.400'
}
# Upper bound on how many chapter links from the index page are downloaded.
max_chapters = 525
# Seconds to wait for the server; without a timeout requests can block forever.
request_timeout = 10

response_1 = requests.get(url=url, headers=headers, timeout=request_timeout)
# Let requests detect the charset from the page body, and remember it so the
# chapter pages can be decoded the same way without re-downloading the index
# on every loop iteration.
page_encoding = response_1.apparent_encoding
response_1.encoding = page_encoding
html_data = response_1.text

# Relative chapter links taken from the <dd><a href="..."> entries of the index.
result_list = re.findall(r'<dd><a href="(.*?)">.*?</a></dd>', html_data, re.S)[:max_chapters]

# Patterns are compiled once because they are reused for every chapter.
title_pattern = re.compile(r'<h1>(.*?)</h1>', re.S)
content_pattern = re.compile(r'<div id="content" class="contentjs">(.*?)</div>', re.S)
# Trailing advertisement script that the site appends to the chapter text.
ad_snippet = "</p><script>site_con_ad('亿笔阁','https://www.yibige.cc');</script>"

for result_name in result_list:
    # Build the chapter URL from the novel's base path and the link target.
    all_url = 'https://www.yibige.cc/110006/' + result_name

    # Send the same browser-like headers as for the index page; a network
    # failure on one chapter is reported and skipped instead of aborting the run.
    try:
        response_2 = requests.get(all_url, headers=headers, timeout=request_timeout)
    except requests.RequestException as exc:
        print('下载失败:', all_url, exc)
        continue
    response_2.encoding = page_encoding
    html_data_2 = response_2.text

    # Chapter title and body; skip pages that lack the expected markup rather
    # than crashing on an empty match list.
    title_match = title_pattern.search(html_data_2)
    content_match = content_pattern.search(html_data_2)
    if title_match is None or content_match is None:
        print('解析失败:', all_url)
        continue
    title = title_match.group(1)

    # Turn the paragraph markup into plain text with one paragraph per line.
    txt = (
        content_match.group(1)
        .replace('    ', '  ')
        .replace('</p><p>', '\n')
        .replace('<p>', '')
        .replace(ad_snippet, '')
    )

    # Characters that are not allowed in file names (notably on Windows) are
    # replaced so that open() cannot fail because of the chapter title.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title.strip())

    # os.path.join picks the right separator on every platform.
    with open(os.path.join('青春无悔', safe_title + '.txt'), mode='w', encoding='utf-8') as f:
        f.write(txt)
    # Report success only after the file has been written and closed.
    print('下载成功:', title)
发表于 2023-01-22 15:25 Adventur 阅读(71) 评论(0) 收藏举报