python bs4: scraping Sogou WeChat search with requests and BeautifulSoup

The script below queries Sogou's WeChat search for the official account 安全客 and, using bs4 (BeautifulSoup), pulls the title and URL of its most recent article.
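Besides the author's local sql_souhu module, the script needs four third-party packages; assuming a standard pip setup, they install with:

pip install requests beautifulsoup4 fake-useragent lxml

(lxml backs the features='lxml' parser handed to BeautifulSoup.)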

import datetime
import queue
import random
import threading
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

import sql_souhu  # author's local database helper (not shown in this post)

# Sogou WeChat search for the official account "安全客" (URL-encoded in the query string)
SEARCH_URL = 'https://weixin.sogou.com/weixin?type=1&s_from=input&query=%E5%AE%89%E5%85%A8%E5%AE%A2&ie=utf8&_sug_=n&_sug_type_='

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
    "Cookie": "ABTEST=0|1646737070|v1; SNUID=8A645D13C1C418731BDB726DC1940F5C; SUID=4AA49DD35B0CA00A00000000622736AE; SUID=4AA49DD3C830A40A00000000622736AE; JSESSIONID=aaaO15ZXqJWL-9EGHUe9x; SUV=00726B21D39DA44A622736AFC9300023; weixinIndexVisited=1; IPLOC=CN2110; ariaDefaultTheme=undefined",
    # "Origin": "https://weixin.sogou.com",
}

# o counts pages that returned real results, n counts pages that did not
o, n = 0, 0


class Souhu_req_wx():
    def __init__(self):
        self.db = sql_souhu
        self.myqueue = self.get_url()

    def get_url(self):
        # Seed the work queue with the search URL.
        myqueue = queue.Queue()
        myqueue.put(SEARCH_URL)
        return myqueue

    def random_sleep(self):
        # Throttle requests so the crawler looks less like a bot.
        time.sleep(random.randint(8, 15))

    def get_random_UA(self):
        # Pull a random User-Agent string from fake_useragent.
        ua = UserAgent(verify_ssl=False)
        return ua.random

    def get_header(self):
        # Rotate the User-Agent and scramble the Cookie on every request.
        headers['User-Agent'] = self.get_random_UA()
        headers['Cookie'] = str(random.randint(1, 5000000))
        return headers
    # Parse the search-results page and, for the account whose name
    # matches offic_name, pull the title and URL of its latest article.
    def official_account_info(self, res_data, offic_name):
        soup = BeautifulSoup(res_data.text, features='lxml')
        # Each result is an <li> under <ul class="news-list2">.
        table_ul = soup.find('ul', class_="news-list2").find_all('li')
        print(len(table_ul))
        for i in table_ul:
            name = i.find('div', class_='txt-box').find('a').text
            if name == offic_name:
                # The third <dl> block holds the most recent article.
                activate_data = i.find_all('dl')[2].find('a')
                title = activate_data.text
                activate_url = activate_data['href']
                print('title', title)
                print('activate_url', activate_url)
    def search_official_account(self, urlquery):
        global o, n
        while not urlquery.empty():
            self.random_sleep()
            url = urlquery.get()
            print(url)
            try:
                res = requests.get(url=url, headers=self.get_header(), timeout=20)
                res.encoding = 'utf-8'
                # A real results page contains the "微信扫一扫" scan hint;
                # anything else is treated as a blocked or empty page.
                if "微信扫一扫" in res.text:
                    self.official_account_info(res, '安全客')
                    o += 1
                else:
                    n += 1
            except Exception as e:
                print('request failed:', e)
    def run_threads(self):
        threadlist = []
        for x in range(0, 1):  # number of worker threads
            th = threading.Thread(target=self.search_official_account, args=(self.myqueue,))
            threadlist.append(th)
        for t in threadlist:
            t.start()
        for t in threadlist:
            t.join()

    def start(self):
        start = datetime.datetime.now()
        self.run_threads()
        print('elapsed:', datetime.datetime.now() - start)

if __name__ == '__main__':
    souhu_search = Souhu_req_wx()
    souhu_search.start()
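sql_souhu is the author's local database helper and is not included in the post; the script only binds the module to self.db, so a hypothetical stub like the one below is enough to let the import resolve:

# sql_souhu.py -- hypothetical stand-in; the real module presumably
# persists scraped titles and URLs to a database.
def save(title, url):
    # Placeholder: a real implementation would INSERT into SQL here.
    print('would save:', title, url)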

  

These bs4 calls do the heavy lifting: soup.find('ul', class_='news-list2') locates the result list and find_all('li') iterates its entries; i.find('div', class_='txt-box').find('a').text reads each account's display name; and find_all('dl')[2].find('a') reaches into the third <dl> block, whose <a> tag carries the latest article's title (its text) and URL (its href attribute). The standalone sketch below runs the same extraction against toy HTML.
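This is a minimal sketch; the HTML is invented to mirror the news-list2 structure the scraper expects, not Sogou's real markup:

from bs4 import BeautifulSoup

# Toy HTML shaped like the expected result list (an assumption, not Sogou's actual markup).
html = '''
<ul class="news-list2">
  <li>
    <div class="txt-box"><a>安全客</a></div>
    <dl><dt>WeChat ID</dt></dl>
    <dl><dt>Intro</dt></dl>
    <dl><dt>Latest</dt><dd><a href="https://example.com/post">Sample title</a></dd></dl>
  </li>
</ul>
'''

soup = BeautifulSoup(html, features='lxml')
for li in soup.find('ul', class_='news-list2').find_all('li'):
    name = li.find('div', class_='txt-box').find('a').text
    latest = li.find_all('dl')[2].find('a')  # third <dl>: the latest article
    print(name, latest.text, latest['href'])

Swapping features='lxml' for 'html.parser' drops the lxml dependency at the cost of a slower parse.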

  

 
