# _author: Jolly
# date: 2019/8/30

import requests
import re

# Use a browser User-Agent so qiushibaike.com does not reject the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}


def parse_url(page_url):
    # Request the page with the browser headers defined above
    response = requests.get(page_url, headers=headers)
    text = response.text

    # Pull the author name out of each post's header block
    authors = re.findall(r'<div\sclass="author clearfix".*?<a.*?<h2>(.*?)</h2>.*?</a>', text, re.S)
    dispose_authors = list(map(lambda data: data.strip(), authors))
    # print(dispose_authors)

    # Pull the post text out of each post's content <span>
    contents = re.findall(r'<div\sclass="content">.*?<span>(.*?)</span>', text, re.DOTALL)

    # Strip any leftover HTML tags (e.g. <br/>) and surrounding whitespace
    dispose_contents = []
    for content in contents:
        content = re.sub(r'<.*?>', "", content)
        dispose_contents.append(content)
    finally_contents = list(map(lambda data: data.strip(), dispose_contents))
    # print(finally_contents)

    # Pair each author with the matching content
    all_contents = []
    for data in zip(dispose_authors, finally_contents):
        author, content = data
        full_content = {
            'author': author,
            'content': content
        }
        all_contents.append(full_content)
    print(all_contents)


def main(n):
    url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(1, n + 1):
        page_url = url.format(i)
        parse_url(page_url)
        # Visual separator between pages
        print("============" * 20, end='\n\n')


if __name__ == '__main__':
    main(2)