# -*- coding: utf-8 -*-
# Sogou WeChat official-account scraper (third-party deps: requests, bs4/lxml, fake_useragent)
"""Search Sogou's WeChat portal for an official account and scrape it.

Multi-threaded scraper: worker threads pull search-result URLs from a
queue, fetch each page with a randomized User-Agent/Cookie, and — when
the page looks valid — parse it with BeautifulSoup to print the latest
article title/URL of the target account ("安全客").
"""

from fake_useragent import UserAgent
import random
import time
import queue
import datetime
import requests
import threading
import sql_souhu
from bs4 import BeautifulSoup

url = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%AE%89%E5%85%A8%E5%AE%A2&ie=utf8&_sug_=n&_sug_type_='

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Cookie": "ABTEST=0|1646737070|v1; SNUID=8A645D13C1C418731BDB726DC1940F5C; SUID=4AA49DD35B0CA00A00000000622736AE; SUID=4AA49DD3C830A40A00000000622736AE; JSESSIONID=aaaO15ZXqJWL-9EGHUe9x; SUV=00726B21D39DA44A622736AFC9300023; weixinIndexVisited=1; IPLOC=CN2110; ariaDefaultTheme=undefined",
    # "origin":" https://weixin.sogou.com",
}

# Shared counters updated by worker threads:
#   o = pages that contained the "微信扫一扫" marker (valid result pages)
#   n = everything else
o, n = 0, 0


class Souhu_req_wx():
    """Scrape Sogou WeChat search results for one official account."""

    def __init__(self):
        # sql_souhu is a project-local module, presumably a DB helper
        # -- TODO confirm; it is stored but not used in this file.
        self.db = sql_souhu
        self.myqueue = self.get_url()

    def get_url(self):
        """Build and return the work queue seeded with the search URL."""
        myqueue = queue.Queue()
        myqueue.put('https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%AE%89%E5%85%A8%E5%AE%A2&ie=utf8&_sug_=n&_sug_type_=')
        return myqueue

    def random_sleep(self):
        """Sleep 8-15 seconds to throttle requests (anti-blocking)."""
        time.sleep(random.randint(8, 15))

    def get_random_UA(self):
        """Return one random User-Agent string.

        Fix: the original built five UserAgent() providers in a loop and
        then sampled a single value — one provider and one .random draw
        is equivalent and far cheaper.
        """
        ua = UserAgent(verify_ssl=False)
        return ua.random

    def get_header(self):
        """Return the shared headers dict with a fresh random UA and a
        randomized (junk) Cookie — presumably to dodge per-cookie rate
        limits; verify this is intended."""
        headers['User-Agent'] = self.get_random_UA()
        headers['Cookie'] = str(random.randint(1, 5000000))
        return headers

    # 处理搜索后的数据
    def official_account_info(self, res_data, offic_name):
        """Parse one search-result page.

        Prints the latest-article title and URL for the account whose
        display name equals *offic_name*.

        Fix: guard against find() returning None (blocked page or
        changed markup) instead of crashing with AttributeError.
        """
        print(res_data)
        soup = BeautifulSoup(res_data.text, features='lxml')
        # Results live in <ul class="news-list2">, one <li> per account.
        news_list = soup.find('ul', class_="news-list2")
        if news_list is None:
            # Page did not contain the expected result list; skip it.
            return
        table_ul = news_list.find_all('li')
        print(len(table_ul))
        for i in table_ul:
            name = i.find('div', class_='txt-box').find('a').text
            if name == offic_name:
                # The third <dl> holds the latest-article link.
                activate_data = i.find_all('dl')[2].find('a')
                title = activate_data.text
                activete_url = activate_data['href']
                print('title', title)
                print('activete_url', activete_url)
        return

    def serch_official_account(self, urlquery):
        """Worker loop: drain *urlquery*, fetch each URL, and dispatch
        valid pages to official_account_info. Updates globals o/n."""
        self.random_sleep()
        while not urlquery.empty():
            url = urlquery.get()
            print(url)
            global o, n
            try:
                res = requests.get(url=url, headers=self.get_header(), timeout=20)
                res.encoding = 'utf-8'
                if "微信扫一扫" in res.text:
                    self.official_account_info(res, '安全客')
                    o += 1
                else:
                    n += 1
            # Fix: was a bare `except BaseException` that also swallowed
            # KeyboardInterrupt/SystemExit; only network errors are expected.
            except requests.RequestException:
                print('异常')

    def weihai_Therad_(self):
        """Spawn the worker thread(s) and wait for them to finish."""
        threadlist = []
        for x in range(0, 1):  # 线程数 (thread count)
            th = threading.Thread(target=self.serch_official_account,
                                  args=(self.myqueue,))
            threadlist.append(th)
        for t in threadlist:
            t.start()
        for t in threadlist:
            t.join()

    def start(self):
        """Run the scrape and print the elapsed wall-clock time."""
        start = datetime.datetime.now()
        self.weihai_Therad_()
        print(datetime.datetime.now() - start)
        print('耗时')


# Fix: guard the module-level entry point so importing this file no
# longer fires network requests as a side effect.
if __name__ == '__main__':
    souhu_serch = Souhu_req_wx()
    souhu_serch.start()
# NOTE(review): a stray duplicate fragment of official_account_info's body
# (BeautifulSoup parse + title/href extraction) was pasted here by mistake;
# removed — the real implementation lives in Souhu_req_wx.official_account_info.