Python Web Scraping: Downloading a Novel (Part 2)

Scraping the novel Douluo Dalu (《斗罗大陆》) from Quanshuwang (全书网). The script below fetches the book's table-of-contents page, uses regular expressions to pull out the title, author, and chapter URLs, then downloads each chapter and appends it to a text file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Fzy on 2018/12/27 17:14
import requests
import re
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}


def get_html(url):
    """Fetch a page and return its HTML text, or None if the request fails."""
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.encoding = 'gbk'  # Quanshuwang pages are GBK-encoded
        return r.text
    except Exception as e:
        print('{}\nRequest failed!'.format(e))
        return None


def get_info(html):
    """Extract the novel's title and author from the table-of-contents page."""
    txt_info = {}
    txt_info['title'] = re.findall(r'<div class="chapName">.*?<strong>(.*?)</strong>', html)[0]
    txt_info['author'] = re.findall(r'<div class="chapName"><span class="r">(.*?)</span>', html)[0]
    return txt_info


def get_urls(html):
    """Extract every chapter URL from the table-of-contents page."""
    # Grab the chapter-list container first (re.I because the tag is uppercase <DIV>)
    li_tags = re.findall(r'<DIV class="clearfix dirconone">(.*?)</div>', html, re.S | re.I)[0]
    urls = re.findall(r'<a href="(.*?)"', li_tags)
    return urls


def save_text(urls, txt_info):
    """Download every chapter and append it to '<title>.txt'."""
    print('Downloading the full novel 《{}》 (687 chapters in total); this may take a while, please wait...'.format(txt_info['title']))
    # Open the file once instead of reopening it for every paragraph
    with open(txt_info['title'] + '.txt', 'a', encoding='utf-8') as file:
        file.write(txt_info['title'] + '\n\n')
        file.write(txt_info['author'] + '\n')
        for url in urls:
            html = get_html(url)
            if html is None:
                continue  # skip chapters that failed to download
            # Each paragraph is indented with four &nbsp; entities and ends with <br />
            text = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<br />', html)
            # The final paragraph ends at the trailing <script> tag instead of <br />
            last = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">', html)
            if last:
                text.append(last[0])
            for paragraph in text:
                file.write('    ' + paragraph + '\n')
            if text:
                print(text[0])  # echo the first line of each chapter as a progress indicator


def main():
    first_url = 'http://www.quanshuwang.com/book/44/44683'  # the novel's table-of-contents page
    html = get_html(first_url)
    if html is None:
        return  # nothing to do if the contents page could not be fetched
    txt_info = get_info(html)
    urls = get_urls(html)
    save_text(urls, txt_info)


if __name__ == '__main__':
    main()
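
The script makes several hundred back-to-back requests, and a single network hiccup silently drops a chapter. Below is a minimal sketch of a retry wrapper, assuming a fixed pause between attempts is acceptable; fetch_with_retry and its retries/delay parameters are illustrative names, not part of the original script:

import time
import requests


def fetch_with_retry(url, headers=None, retries=3, delay=2):
    """Try a URL up to `retries` times, pausing `delay` seconds between failures."""
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()  # treat HTTP errors (404, 500, ...) as failures
            r.encoding = 'gbk'    # Quanshuwang serves GBK-encoded pages
            return r.text
        except Exception as e:
            print('Attempt {} failed for {}: {}'.format(attempt + 1, url, e))
            time.sleep(delay)     # wait before retrying, and go easy on the server
    return None  # caller should skip this chapter

get_html could simply delegate to this helper (fetch_with_retry(url, headers=headers)), and an extra time.sleep between chapter downloads would further reduce the load on the site.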