python学习之爬取小说

 1 import requests
 2 import re
 3 from lxml import etree
 4 import os
 5 import time
 6 
 7 
 8 def get_html():
 9     headers = {}
10     '''提供访问方式,包含操作系统,cpu,浏览器等,服务器会根据User_Agent返回不同的界面'''
11     headers['User_Agent'] = 'Mozilla/5.0 (Windows NT 10.0;' \
12                             ' Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
13                             'Chrome/76.0.3809.132 Safari/537.36'
14     '''url代表网址'''
15     url = 'http://www.xbiquge.la/15/15021/'
16     '''content中间存的是字节码,而text中存的是Beautifulsoup根据猜测的编码方式将content内容编码成字符串,使用.decode()将其按照既定格式的编码返回html'''
17     html = requests.get(url, headers=headers).content.decode('utf-8')
18     return html
19 
20 
def get_novel_url(html, path='G:/剑来'):
    """Download every chapter linked from the index page into one text file.

    Chapter links are extracted from ``html`` with a regular expression.
    Each chapter page is fetched, the text nodes of the element with
    ``id="content"`` are extracted, and the chapter title plus text are
    appended to ``<path>/剑来.txt``. The output file is emptied at the start
    of every call.

    Args:
        html: Source of the table-of-contents page.
        path: Directory that receives the output file; created if missing.

    Returns:
        None. Progress and per-chapter errors are printed to stdout.
    """
    # Group 1 is the chapter's site-relative address, group 2 its title.
    chapter_pattern = r"<dd><a href='(.*?)' >(.*?)</a></dd>"
    chapters = re.findall(chapter_pattern, html)

    os.makedirs(path, exist_ok=True)
    filename = os.path.join(path, '剑来.txt')
    # Create or truncate the output file; the context manager closes the
    # handle immediately instead of leaking it.
    with open(filename, 'w', encoding='utf-8'):
        pass

    for novel_url, novel_name in chapters:
        chapter_url = 'http://www.xbiquge.la' + novel_url
        # Deliberately best-effort: a chapter that fails to download or
        # parse is reported and skipped so the remaining ones still run.
        try:
            print("正在下载小说----->%s" % novel_name)
            page = requests.get(chapter_url, timeout=10).content.decode('utf-8', 'ignore')
            # etree.HTML builds a parse tree that tolerates malformed markup.
            content = etree.HTML(page).xpath('//*[@id="content"]/text()')
            with open(filename, 'a', encoding='utf-8') as f:
                f.write(novel_name + '\n\n')
                f.writelines(content)
            # Throttle requests; done after the file has been closed.
            time.sleep(1)
        except Exception as e:
            print("下载出错", e)
52 
53 
def main():
    """Fetch the index page and download all chapters it links to."""
    get_novel_url(get_html())
57 
58 
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

 

posted @ 2019-10-20 21:54  大壞狐狸  阅读(184)  评论(0)    收藏  举报