1 # -*- coding: utf-8 -*-
2 from bs4 import BeautifulSoup
3 import concurrent.futures
4 import requests
5
6
7
# HTTP request headers sent with every requests.get() call below.
# Fix: the first key was a truncated "cept" — it must be "Accept" to be a
# valid HTTP header; everything else is unchanged.
hd = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Host": "www.xxxx.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
}
17
18 # 输出到文件
# 输出到文件 (append text to an output file)
def write(path, text):
    """Append *text* to the file at *path*, encoded as UTF-8.

    Opens in binary-append mode ('ab') so the file is created if missing
    and bytes are written without platform newline translation, matching
    the original behavior. The context manager guarantees the handle is
    closed even if the write raises.
    """
    with open(path, 'ab') as out:
        out.write(text.encode("utf-8"))
23
24 # 爬取动作
# 爬取动作 (scrape one page)
def start(url):
    """Fetch *url*, extract every 'textlist-body' div, and append each to out.txt.

    Side effects only: prints each matched div and appends
    "<url>, <div>\n" lines to out.txt via write(). Returns None.
    Removed the commented-out dead write() call.
    """
    page = BeautifulSoup(requests.get(url, headers=hd).text, "html.parser")
    for block in page.find_all('div', class_='textlist-body'):
        print(block)
        write("out.txt", '{}, {}\n'.format(url, block))
31
32
def Country_url():
    """Scrape the airports index page, build one URL per listed entry, and
    fetch them all concurrently via start().

    Fixes two defects in the original:
    - a new ThreadPoolExecutor was created for every 'textlist-body' div;
      now all URLs are collected first and a single executor handles them.
    - the results loop printed only None (start() returns nothing); the map
      is still consumed so worker exceptions propagate, but nothing is printed.
    """
    base = "https://www.xxxx.com/"
    index = BeautifulSoup(requests.get(base + "airports", headers=hd).text,
                          "html.parser")

    urls = []
    for section in index.find_all('div', class_='textlist-body'):
        # NOTE(review): assumes each child has a usable .string — depends on
        # the page structure; the `!= " "` filter skips whitespace-only nodes.
        urls.extend(base + child.string.replace(" ", "-")
                    for child in section if child != " ")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Consume the iterator so any exception raised in start() surfaces here.
        for _ in executor.map(start, urls):
            pass
43
44
45
# Script entry point: kick off the airport-listing scrape.
if __name__ == "__main__":
    Country_url()