Python爬虫爬取中国古诗词网上的名句
运行截图:
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
"""Scrape famous quotes ("mingju") from so.gushiwen.cn into a text file."""
from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

# Globals: request headers, site root, request timeout and the output file object.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44'}
BASE_URL = 'https://so.gushiwen.cn/'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests may wait forever
file = open('./古诗名句.txt', 'w', encoding='utf-8')


# Fetch the page source
def index(pages=range(1, 12)):
    """Download each listing page and hand its HTML to ``clean``.

    Args:
        pages: Iterable of page numbers to request. Defaults to pages 1-11,
            the range the script was originally hard-coded to.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    for num in pages:
        base_url = 'https://so.gushiwen.cn/mingju/default.aspx?p={}&c=&t='.format(num)  # pagination
        print('正在写入', base_url, '中的数据信息...')
        response = requests.get(base_url, headers=headers, timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # An error page holds no quotes; report it instead of parsing it silently.
            print('请求失败,已跳过', base_url, '状态码:', response.status_code)
            continue
        response.encoding = 'utf-8'  # decode the body as UTF-8
        clean(response.text)


# Clean the data
def clean(html):
    """Pull quote links, quote texts and poem titles out of one page and store them.

    The fields are read container by container (each ``div`` whose class is
    ``cont``) so the three lists always stay the same length and aligned.
    Page-wide queries zipped together would drift as soon as one container
    lacked an anchor or an anchor yielded several text nodes.

    Args:
        html: Page source as text.
    """
    tree = etree.HTML(html)
    Mingjus_urls, Mingjus, Poem_titles = [], [], []
    for cont in tree.xpath('//div[@class="cont"]'):
        hrefs = cont.xpath('./a[1]/@href')
        quote = ''.join(cont.xpath('./a[1]/text()'))
        if not hrefs or not quote:
            # Not a quote entry (no first anchor with a link and text).
            continue
        Mingjus_urls.append(hrefs[0])
        Mingjus.append(quote)
        # A missing second anchor yields an empty title rather than shifting rows.
        Poem_titles.append(''.join(cont.xpath('./a[2]/text()')))
    sto(Mingjus_urls, Mingjus, Poem_titles)


# Store the data
def sto(Mingjus_urls, Mingjus, Poem_titles):
    """Write one record per quote to the module-level output ``file``.

    Each record is ``<quote>\\t<title>`` on one line followed by a line with
    the absolute URL of the quote page.

    Args:
        Mingjus_urls: Quote page hrefs as found in the page.
        Mingjus: Quote texts, aligned with ``Mingjus_urls``.
        Poem_titles: Source titles, aligned with ``Mingjus_urls``.
    """
    records = []
    for href, Mingju, Poem_title in zip(Mingjus_urls, Mingjus, Poem_titles):
        # urljoin copes with both "/path" and "path" hrefs; plain string
        # concatenation produced "//" for root-relative links.
        Mingju_url = urljoin(BASE_URL, href)
        records.append('{}\t{}\n名句网址:{}\n'.format(Mingju, Poem_title, Mingju_url))
    file.writelines(records)


if __name__ == '__main__':
    # The context manager closes the output file even if the crawl raises.
    with file:
        index()