Python爬虫之爬取小说(三)

爬取“顶点小说网”《纯阳剑尊》

import requests
from bs4 import BeautifulSoup
# Anti-scraping workaround: send a regular desktop-browser User-Agent.
headers = {
    # Adjacent string literals are joined by the parser. A backslash
    # continuation *inside* a literal would instead embed the next line's
    # indentation in the header value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, '
        'like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    ),
}

# Fetch the page
def open_url(url, timeout=10):
    """Download *url* and return the response body decoded to text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The page content as a ``str``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the
    # HTML parser further down the pipeline.
    response.raise_for_status()
    # Decode with the encoding detected from the body rather than the one
    # taken from the response headers.
    response.encoding = response.apparent_encoding
    return response.text

# Extract the title
def get_title(url):
    """Return the chapter heading, wrapped in a leading and trailing newline.

    Despite its name, ``url`` is passed straight to BeautifulSoup as the
    markup to parse (``main`` supplies the downloaded page text). The heading
    is the text of the ``<h1>`` inside the first ``<dd>`` element.
    """
    document = BeautifulSoup(url, 'lxml')
    first_dd = document.find('dd')
    heading = first_dd.h1.get_text()
    return f'\n{heading}\n'

# Extract the body text
def get_texts(url):
    """Return every ``<dd id="contents">`` element found in the markup.

    ``url`` is passed to BeautifulSoup as the markup to parse. The result is
    whatever ``find_all`` yields, i.e. a list-like collection of tags that
    may be empty.
    """
    return BeautifulSoup(url, 'lxml').find_all('dd', id="contents")

# Save the title
def save_title(filename, title):
    """Append *title* to the UTF-8 text file *filename*.

    The file is created if it does not exist; existing content is kept.
    """
    with open(filename, mode='a+', encoding='utf-8') as out:
        out.write(title)

# Save the body text
def save_text(filename, text):
    """Append *text* to the UTF-8 text file *filename*.

    The file is created if it does not exist; existing content is kept.
    """
    with open(filename, mode='a+', encoding='utf-8') as out:
        out.write(text)

# Main program function
def main():
    """Ask for a chapter number, download that chapter and append it to a file.

    The chapter page address is built by adding the chapter number to a fixed
    offset. The title and every matched text element are appended to
    ``纯阳剑尊.txt`` in the current working directory. Invalid input (not an
    integer, or outside the advertised 1-802 range) prints a message and
    returns without downloading anything.
    """
    first_chapter, last_chapter = 1, 802
    # Page file names are this offset plus the chapter number.
    page_offset = 8184027

    answer = input('《纯阳剑尊》你想要下载第几章?(1-802)')
    try:
        num = int(answer)
    except ValueError:
        print('请输入{}-{}之间的整数!'.format(first_chapter, last_chapter))
        return
    # Reject numbers outside the range; they would address an unrelated page.
    if not first_chapter <= num <= last_chapter:
        print('请输入{}-{}之间的整数!'.format(first_chapter, last_chapter))
        return

    number = page_offset + num
    url = 'https://www.23us.so/files/article/html/15/15905/' + str(number) + '.html'
    filename = '纯阳剑尊.txt'

    r = open_url(url)
    title = get_title(r)
    tags = get_texts(r)
    save_title(filename, title)
    for text_tag in tags:
        # One trailing newline per matched element keeps them separated.
        text = text_tag.get_text() + '\n'
        save_text(filename, text)
    print('第{}章已经下载完成!'.format(num))

# Run the downloader only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
posted @ 2020-09-23 10:07  chchcharlie、  阅读(207)  评论(0)    收藏  举报