1 import requests
2 import re
3
# Listing page that links to the individual member pages.
INDEX_URL = 'http://www.cae.cn/cae/html/main/col48/column_48_1.html'
# Template for a member detail page; ``{}`` is the numeric page id.
DETAIL_URL = 'http://www.cae.cn/cae/html/main/colys/{}.html'
# Text file that the cleaned introductions are appended to.
OUTPUT_PATH = r'C:\Users\Administrator\Desktop\888.txt'
# Only the first MAX_PAGES links found on the listing page are processed.
MAX_PAGES = 2
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10

# Captures the numeric id from each detail-page link on the listing page.
LINK_RE = re.compile(
    r'<a href="/cae/html/main/colys/(\d+)\.html" target="_blank">'
)
# Captures everything inside the intro <div>; re.S lets ``.`` span newlines.
INTRO_RE = re.compile(r'<div class="intro">(.*?)</div>', re.S)
# Blanks and paragraph tags stripped from the intro text.  The pasted script
# listed the same blank alternative twice, which presumably was an HTML
# entity or non-ASCII space lost in transit, so those forms are spelled out
# explicitly here (NBSP, en space, ideographic space and their entities).
NOISE_RE = re.compile(r'&nbsp;|&ensp;|[ \u00a0\u2002\u3000]|</?p>')


def fetch_html(page_url):
    """Download *page_url* and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def extract_member_ids(index_html):
    """Return the detail-page ids linked from *index_html*, in page order."""
    return LINK_RE.findall(index_html)


def extract_intro(detail_html):
    """Return the cleaned intro text of a detail page.

    Returns ``None`` when the page contains no ``<div class="intro">``.
    """
    match = INTRO_RE.search(detail_html)
    if match is None:
        return None
    return NOISE_RE.sub('', match.group(1)).strip()


def main():
    """Scrape the first MAX_PAGES intros, print them and append them to disk."""
    member_ids = extract_member_ids(fetch_html(INDEX_URL))
    for member_id in member_ids[:MAX_PAGES]:
        page_url = DETAIL_URL.format(member_id)
        intro = extract_intro(fetch_html(page_url))
        if intro is None:
            # A page without an intro block is skipped rather than crashing.
            print('no intro block found: {}'.format(page_url))
            continue
        print(intro)
        # Append each entry as soon as it is fetched so that earlier results
        # are kept if a later request fails.  An explicit encoding avoids
        # UnicodeEncodeError under a non-UTF-8 locale default.
        with open(OUTPUT_PATH, mode='a', encoding='utf-8') as out:
            out.write(intro + '\n' * 2)


if __name__ == '__main__':
    main()