谷歌搜索批量采集第一页结果title

 1 #!/usr/bin/env python
 2 # -*- coding: utf-8 -*-
 3 # author:么么哒
 4 import requests
 5 import re
 6 from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED, FIRST_COMPLETED
# Turn off urllib3's warning output. The request made later in this file passes
# verify=False, which is presumably what would otherwise trigger the warnings.
requests.packages.urllib3.disable_warnings()
 8 
 9 proxy = {
10     'http':'127.0.0.1:1080',
11     'https':'127.0.0.1:1080'
12 }
13 heads={
14     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36',
15     'Content-Type':'text/xml; charset=utf-8'
16 }
17 
18 
def Reptile(text):
    """Fetch the Google results page for ``text`` and save the result titles.

    The request goes to ``https://www.google.com/search`` using the
    module-level ``heads`` headers and ``proxy`` settings, with certificate
    verification disabled. Every title captured by the regular expression
    below is appended, one per line, to ``google-title.txt`` (UTF-8).

    Args:
        text: Search keyword. It is URL-encoded before being placed in the
            query string, so spaces, ``&``, ``#`` and non-ASCII characters
            are transmitted intact.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        single bad keyword does not abort the rest of the batch.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    try:
        target = 'https://www.google.com/search?q={}'.format(quote_plus(text))
        r = requests.get(target, headers=heads, timeout=30, proxies=proxy, verify=False)
        # The capture group is the text between the result heading's opening
        # tag (identified by its class list) and the closing </h3>.
        pattern = re.compile(r'class="LC20lb MBeuO DKV0Md">(.*?)</h3><div class=')
        titles = pattern.findall(r.text)
        print(target)
        if titles:
            # Open the file once and emit all titles in a single write
            # instead of reopening the file for every title.
            with open("google-title.txt", 'a', encoding='utf-8') as f:
                f.write("".join(title + "\n" for title in titles))
    except Exception as e:
        # Worker-thread boundary: report which keyword failed and why rather
        # than swallowing the exception behind a bare "error".
        print("error: {!r} ({!r})".format(text, e))
35 
# Read the search keywords, one per line. The context manager closes the file,
# strip() also removes a trailing '\r' from CRLF files, and blank lines are
# skipped so no empty search is submitted.
with open(r'test.txt', 'r', encoding='utf-8') as keyword_file:
    domain = [line.strip() for line in keyword_file if line.strip()]

# Run one Reptile call per keyword on a pool of 20 worker threads. Waiting for
# ALL_COMPLETED (instead of FIRST_COMPLETED) blocks until the whole batch is
# finished; leaving the with-block then shuts the pool down.
with ThreadPoolExecutor(max_workers=20) as executor:
    all_task = [executor.submit(Reptile, text) for text in domain]
    wait(all_task, return_when=ALL_COMPLETED)

 

posted @ 2022-09-09 17:23  射满东城湖  阅读(72)  评论(0)    收藏  举报