1 # -*- coding: utf-8 -*-
2 #author:zxy
3 #Date:2018-10-19
4
5
6 import requests
7 import re
# HTTP headers passed to requests.get() when a page is fetched.  The
# User-Agent is a desktop Chrome 69 identification string (presumably so the
# site serves the same markup a browser would get -- not verified here).
HEADERS = dict([
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.100 Safari/537.36",
    ),
])
12
13
def parse_url(url, output_path='poems.txt', timeout=10):
    """Download a poem listing page, extract its poems and save them as text.

    The page is fetched with the module-level ``HEADERS``.  Title, dynasty,
    author and body of each poem are extracted with regular expressions and
    written to ``output_path`` as UTF-8.  Each poem body is also printed to
    stdout.

    Args:
        url: Address of the page to download.
        output_path: File the poems are written to; an existing file is
            overwritten.  Defaults to ``'poems.txt'``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server replies with an HTTP error status.  The output file is
            not opened in that case, so a previous result is left intact.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Stop on 4xx/5xx instead of parsing an error page and truncating the
    # output file with zero poems.
    response.raise_for_status()
    poems = _extract_poems(response.text)
    _write_poems(poems, output_path)


def _clean_content(fragment):
    """Strip tags from one poem body and break the line after each '。'."""
    without_tags = re.sub(r'<.*?>', '', fragment)
    return without_tags.replace('。', '。\n').strip()


def _extract_poems(text):
    """Return the poems found in HTML ``text`` as a list of dicts.

    Each dict has the keys ``title``, ``dynasty``, ``author`` and ``content``.
    """
    # re.DOTALL lets '.' cross newlines, since the markup spans several lines.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)
    # First link inside the "source" paragraph.
    dynasties = re.findall(r'<p\sclass="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    # Second link inside the "source" paragraph.
    authors = re.findall(r'<p\sclass="source">.*?<a.*?<a.*?>(.*?)</a>', text, re.DOTALL)
    bodies = re.findall(r'<div\sclass="contson".*?>(.*?)</div>', text, re.DOTALL)
    contents = [_clean_content(body) for body in bodies]

    # NOTE(review): the four lists are matched independently and paired purely
    # by position; zip() stops at the shortest one.  A poem lacking one of the
    # fields would shift the pairing for every poem after it.
    return [
        {
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def _write_poems(poems, output_path):
    """Write ``poems`` to ``output_path`` and echo each poem body to stdout."""
    record = "{title}\n\t{dynasty}\n\t{author}\n{content}\n\n\n"
    with open(output_path, 'w', encoding="utf-8") as f:
        for poem in poems:
            print(poem["content"])
            # Fields are looked up by key, so the layout does not depend on
            # the iteration order of the dict.
            f.write(record.format(**poem))
51
if __name__ == '__main__':
    # Script entry point: scrape the single hard-coded page below.
    parse_url("https://www.gushiwen.org/default_1.aspx")