1 """古诗文网爬虫"""
2
3
4 import re
5 import requests
6
def parse_page(url):
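    """Fetch one listing page and print the poems extracted from it."""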
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }

    # headers must be passed as a keyword argument; the second positional
    # argument of requests.get is params, not headers
    response = requests.get(url, headers=headers)
    # print(response.text)
    text = response.text

    # Parse the page with regular expressions
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)  # '.' does not match '\n' by default; re.DOTALL makes it match any character
    # print(titles)
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # print(dynasties)
    authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # print(authors)
    content_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)
    # print(content_tags)
    contents = []
    for content in content_tags:
        # Strip the remaining HTML tags; non-greedy so each tag is removed individually
        x = re.sub(r'<.*?>', "", content).strip()
        contents.append(x)
    poems = []
    for value in zip(titles, dynasties, authors, contents):
        title, dynasty, author, content = value
        poem = {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content
        }
        poems.append(poem)

    # Print each poem record
    for poem in poems:
        print(poem)


def main():
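    """Crawl the first ten listing pages and print their poems."""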
    url = "https://www.gushiwen.org/default_{}.aspx"
    for x in range(1, 11):
        newurl = url.format(x)
        parse_page(newurl)

if __name__ == '__main__':
    main()