Python web scraping: downloading a novel (Part 2)
Scraping the novel Douluo Dalu (《斗罗大陆》) from Quanshuwang (全书网). The script works in four steps: fetch the book's table-of-contents page, pull the title and author out of the page header, collect every chapter URL from the contents list, then download each chapter in turn and append it to a text file.
```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by Fzy on 2018/12/27 17:14
import re

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
}


def get_html(first_url):
    """Fetch a page; quanshuwang serves GBK-encoded pages."""
    try:
        r = requests.get(first_url, headers=headers)
        r.encoding = 'gbk'
        return r.text
    except Exception as e:
        print(str(e) + '\nRequest failed!')
        return None


def get_info(html):
    """Pull the novel's title and author from the contents page."""
    txt_info = {}
    txt_info['title'] = re.findall(r'<div class="chapName">.*?<strong>(.*?)</strong>', html)[0]
    txt_info['author'] = re.findall(r'<div class="chapName"><span class="r">(.*?)</span>', html)[0]
    return txt_info


def get_urls(html):
    """Collect every chapter URL from the table of contents."""
    li_tags = re.findall(r'<DIV class="clearfix dirconone">(.*?)</div>', html, re.S | re.I)[0]
    return re.findall(r'<a href="(.*?)"', li_tags)


def save_text(urls, txt_info):
    """Download each chapter and append it to <title>.txt."""
    with open(txt_info['title'] + '.txt', 'a+', encoding='utf-8') as file:
        file.write(txt_info['title'] + '\n\n')
        file.write(txt_info['author'] + '\n')
    print('Downloading the full novel "{}" (687 chapters in total); '
          'this will take a while, please be patient...'.format(txt_info['title']))
    for url in urls:
        html = get_html(url)
        if html is None:  # skip chapters whose request failed
            continue
        # Paragraphs are indented with four &nbsp; entities and end at <br />;
        # the final paragraph ends at a <script> tag instead.
        text = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<br />', html)
        tail = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">', html)
        if tail:
            text.append(tail[0])
        with open(txt_info['title'] + '.txt', 'a+', encoding='utf-8') as file:
            for i in text:
                file.write('    ' + i + '\n')
        if text:
            print(text[0])  # echo the first line as a progress indicator


def main():
    first_url = 'http://www.quanshuwang.com/book/44/44683'
    html = get_html(first_url)
    txt_info = get_info(html)
    urls = get_urls(html)
    save_text(urls, txt_info)


if __name__ == '__main__':
    main()
```
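The content regexes are the fragile part of the script: they assume each paragraph starts with four `&nbsp;` entities and ends at `<br />`, with the final paragraph ending at a `<script>` tag instead (that is how quanshuwang chapter pages were marked up when this was written; adjust if the site has changed). A quick way to sanity-check the patterns before launching the full crawl is to run them against a small made-up sample in that shape:

```python
import re

# Made-up sample mimicking the assumed chapter-page markup (real pages may differ).
sample = ('&nbsp;&nbsp;&nbsp;&nbsp;First paragraph.<br />\n'
          '&nbsp;&nbsp;&nbsp;&nbsp;Last paragraph.<script type="text/javascript">')

paragraphs = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<br />', sample)
tail = re.findall(r'&nbsp;&nbsp;&nbsp;&nbsp;(.*?)<script type="text/javascript">', sample)
if tail:
    paragraphs.append(tail[0])
print(paragraphs)  # ['First paragraph.', 'Last paragraph.']
```

If the printed list looks wrong when you substitute a real chapter page saved to disk, fix the patterns first; debugging a regex across 687 live requests is far more painful.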
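Fetching all 687 chapter pages back-to-back also risks timeouts or throttling, and `get_html` above gives up after a single failed attempt. Below is a minimal sketch of a more forgiving fetcher, reusing the `headers` dict from the script; the helper name `get_html_retry` and the retry/delay values are my own choices, not part of the original post:

```python
import time

import requests


def get_html_retry(url, headers=None, retries=3, delay=2.0):
    """Hypothetical helper: fetch a page with a few retries,
    pausing between attempts so the crawl stays polite."""
    for attempt in range(1, retries + 1):
        try:
            r = requests.get(url, headers=headers, timeout=10)
            r.raise_for_status()  # turn HTTP errors into exceptions
            r.encoding = 'gbk'    # the site serves GBK-encoded pages
            return r.text
        except requests.RequestException as e:
            print('attempt {} failed for {}: {}'.format(attempt, url, e))
            time.sleep(delay)     # back off before the next try
    return None                   # caller should skip this chapter
```

Swapping `get_html(url)` for `get_html_retry(url, headers=headers)` inside the chapter loop, plus a short pause between chapters, keeps the long crawl both resilient and friendly to the server.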
