# Extraction residue ("python bs4" / "1 bs4") — appears to be a garbled
# dependency note (bs4 / BeautifulSoup); commented out so the file parses.
from fake_useragent import UserAgent
import random
import time
import queue
import datetime
import requests
import threading
import sql_souhu
from bs4 import BeautifulSoup
# Sogou Weixin search URL for the official account "安全客" (URL-encoded query).
url = (
    'https://weixin.sogou.com/weixin?type=1&s_from=input'
    '&query=%E5%AE%89%E5%85%A8%E5%AE%A2&ie=utf8&_sug_=n&_sug_type_='
)

# Shared request headers; User-Agent and Cookie get overwritten per request
# by Souhu_req_wx.get_header().
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/98.0.4758.102 Safari/537.36"
    ),
    "Cookie": (
        "ABTEST=0|1646737070|v1; SNUID=8A645D13C1C418731BDB726DC1940F5C; "
        "SUID=4AA49DD35B0CA00A00000000622736AE; "
        "SUID=4AA49DD3C830A40A00000000622736AE; "
        "JSESSIONID=aaaO15ZXqJWL-9EGHUe9x; "
        "SUV=00726B21D39DA44A622736AFC9300023; weixinIndexVisited=1; "
        "IPLOC=CN2110; ariaDefaultTheme=undefined"
    ),
    # "origin":" https://weixin.sogou.com",
}

# Global counters: o = pages recognized as results, n = everything else.
o, n = 0, 0
class Souhu_req_wx():
    """Search Sogou Weixin for the official account "安全客" and print the
    matching account's latest article title/URL.

    NOTE(review): relies on the module-level ``headers`` dict and the
    project-local ``sql_souhu`` module (stored on ``self.db`` but not
    otherwise used in this block — TODO confirm intended usage).
    """

    def __init__(self):
        self.db = sql_souhu            # project DB helper module
        self.myqueue = self.get_url()  # work queue of search-result URLs

    def get_url(self):
        """Build and return the work queue, seeded with the one search URL."""
        myqueue = queue.Queue()
        myqueue.put('https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%AE%89%E5%85%A8%E5%AE%A2&ie=utf8&_sug_=n&_sug_type_=')
        return myqueue

    def random_sleep(self):
        """Throttle between requests.

        Currently disabled: the original code returned before ever reaching
        the sleep; that behavior is preserved on purpose.
        """
        return
        # Disabled in the original source:
        # time.sleep(random.randint(8, 15))

    def get_random_UA(self):
        """Return one random User-Agent string.

        Fix: the original constructed five ``UserAgent`` objects per call
        just to sample one random string; a single instance yields the same
        result much more cheaply.
        """
        ua = UserAgent(verify_ssl=False)
        return ua.random

    def get_header(self):
        """Return the shared headers dict with randomized UA and Cookie.

        NOTE(review): mutates the module-level ``headers`` in place, as the
        original did.
        """
        headers['User-Agent'] = self.get_random_UA()
        headers['Cookie'] = str(random.randint(1, 5000000))
        return headers

    # Parse the search-result page returned by serch_official_account.
    def official_account_info(self, res_data, offic_name):
        """Print title and link of the latest article for the account whose
        display name equals ``offic_name``.

        :param res_data: a ``requests`` Response for the search page
        :param offic_name: exact display name to match (e.g. '安全客')
        """
        print(res_data)
        soup = BeautifulSoup(res_data.text, features='lxml')
        news_list = soup.find('ul', class_="news-list2")
        if news_list is None:
            # Fix: the original raised AttributeError when the result list
            # was absent (e.g. an anti-bot/captcha page was served).
            return
        table_ul = news_list.find_all('li')
        print(len(table_ul))
        for item in table_ul:
            name = item.find('div', class_='txt-box').find('a').text
            if name == offic_name:
                # Third <dl> holds the latest-article entry — TODO confirm
                # against the live page layout.
                activate_data = item.find_all('dl')[2].find('a')
                title = activate_data.text
                activete_url = activate_data['href']
                print('title', title)
                print('activete_url', activete_url)
        return

    def serch_official_account(self, urlquery):
        """Worker: drain ``urlquery``, fetch each URL, and pass pages that
        look like genuine results on to ``official_account_info``.

        Updates the global hit/miss counters ``o`` and ``n``.
        """
        self.random_sleep()
        while True:
            # Fix: `while not q.empty(): q.get()` is race-prone with several
            # workers; get_nowait + queue.Empty is the atomic equivalent.
            try:
                url = urlquery.get_nowait()
            except queue.Empty:
                break
            print(url)
            global o, n
            try:
                res = requests.get(url=url, headers=self.get_header(), timeout=20)
                res.encoding = 'utf-8'
                if "微信扫一扫" in res.text:
                    self.official_account_info(res, '安全客')
                    o += 1
                else:
                    n += 1
            except Exception as exc:
                # Fix: original `except BaseException: print('异常')` also
                # swallowed KeyboardInterrupt/SystemExit and hid the cause.
                print('异常', exc)

    def weihai_Therad_(self):
        """Start the worker thread(s) over the shared queue and join them."""
        threadlist = []
        for x in range(0, 1):  # 线程数 (thread count)
            th = threading.Thread(target=self.serch_official_account, args=(self.myqueue,))
            threadlist.append(th)
        for t in threadlist:
            t.start()
        for t in threadlist:
            t.join()

    def start(self):
        """Run the scrape and print the elapsed wall-clock time."""
        start = datetime.datetime.now()
        self.weihai_Therad_()
        print(datetime.datetime.now() - start)
        print('耗时')
if __name__ == '__main__':
    # Run as a script only; guard prevents kicking off network I/O when the
    # module is merely imported.
    souhu_serch = Souhu_req_wx()
    souhu_serch.start()
# NOTE(review): removed a stray copy-paste fragment that duplicated the
# interior of official_account_info at module level. It referenced names
# (res_data, i) that are undefined at module scope and raised NameError
# at import time.

# 浙公网安备 33010602011771号 — Zhejiang public-security filing footer copied
# from the source web page; commented out so the file parses.