Python学习之爬取小说
import os
import re
import time

import requests
from lxml import etree

# Browser-like request headers. The header name must be spelled
# "User-Agent" (with a hyphen); a key such as "User_Agent" is not the
# user-agent header, so the server would still see the default
# python-requests agent string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;'
                  ' Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/76.0.3809.132 Safari/537.36',
}

# Site root; chapter links found on the index page are relative to it.
BASE_URL = 'http://www.xbiquge.la'
# Index page that lists every chapter of the novel.
INDEX_URL = BASE_URL + '/15/15021/'
# Directory the combined text file is written to.
SAVE_DIR = 'G:/剑来'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10
# Captures (relative chapter URL, chapter title) from each index entry.
CHAPTER_LINK_RE = re.compile(r"<dd><a href='(.*?)' >(.*?)</a></dd>")


def get_html(url=INDEX_URL):
    """Download the chapter index page and return it as a string.

    Args:
        url: Address of the index page; defaults to INDEX_URL.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # `.content` is the raw bytes; decode explicitly instead of relying on
    # the encoding that requests guesses for `.text`.
    return response.content.decode('utf-8')


def _fetch_chapter(url):
    """Return the text nodes of the element with id "content" at *url*."""
    response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page = response.content.decode('utf-8', 'ignore')
    # etree.HTML builds a parse tree and repairs malformed markup.
    tree = etree.HTML(page)
    if tree is None:
        return []
    return tree.xpath('//*[@id="content"]/text()')


def get_novel_url(html, path=SAVE_DIR):
    """Download every chapter linked from *html* into a single text file.

    Chapter links are extracted from the index page with CHAPTER_LINK_RE.
    The output file "<path>/剑来.txt" is created (or truncated) once, then
    each chapter title and body is written to it in index order. A chapter
    that fails to download or write is reported and skipped.

    Args:
        html: Source of the chapter index page.
        path: Directory for the output file; created if missing.
    """
    # findall returns one (relative_url, title) tuple per matching entry.
    chapters = CHAPTER_LINK_RE.findall(html)

    os.makedirs(path, exist_ok=True)
    filename = path + '/' + '{}.txt'.format('剑来')

    # Open once for the whole run: 'w' starts from an empty file, and the
    # context manager guarantees the handle is closed.
    with open(filename, 'w', encoding='utf-8') as book:
        for chapter_path, chapter_name in chapters:
            print("正在下载小说----->%s" % chapter_name)
            try:
                paragraphs = _fetch_chapter(BASE_URL + chapter_path)
                book.write(chapter_name + '\n' + '\n')
                book.writelines(paragraphs)
                # Make finished chapters visible on disk immediately.
                book.flush()
            except (requests.RequestException, etree.LxmlError,
                    OSError) as e:
                print("下载出错", e)
            # Pause between requests to avoid hammering the server.
            time.sleep(1)


def main():
    """Fetch the chapter index and download the whole novel."""
    html = get_html()
    get_novel_url(html)


if __name__ == '__main__':
    main()
人不一定要生得漂亮,但却一定要活得漂亮。
人生的精彩不在于抽到一手好牌,而在于打好一手烂牌。

浙公网安备 33010602011771号