from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import requests
import re
import time
import pandas as pd
import math
import time
from multiprocessing.dummy import Pool
def get_soup(url):
    """Fetch *url* with a browser-like User-Agent and return the parsed page.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        A BeautifulSoup document built with the ``html.parser`` backend.

    Raises:
        urllib.error.URLError: on network / HTTP failure (propagated).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers, method='GET')
    # Context manager closes the HTTP response deterministically; the
    # original leaked the connection (urlopen result was never closed).
    with urllib.request.urlopen(req) as response:
        text = response.read().decode('utf-8')
    return BeautifulSoup(text, "html.parser")
def fenleipa(label, n, url):
    """Crawl every listing page of one category and collect its entries.

    Args:
        label: Category name, prefixed to each output line.
        n: Total number of items in the category (20 items per page,
           so ``ceil(n / 20)`` pages are fetched).
        url: Category base URL; page ``i`` lives at ``url + 'page/i'``.

    Returns:
        One ``"<label> <title> <href>"`` line per entry, newline-terminated,
        concatenated into a single string ('' when nothing was found).
    """
    lines = []
    pages = math.ceil(n / 20)  # 20 entries per listing page
    for page in range(1, pages + 1):
        soup = get_soup(url + 'page/' + str(page))
        for box in soup.find_all("div", attrs={'class': 'feature-box clearfix'}):
            detail = get_soup(box.div.a.attrs['href'])
            post = detail.find("div", attrs={'class': 'posts clearfix'}).find("ul", class_=False)
            for item in post.find_all("li"):
                txt = item.text
                urls = re.findall(r'https?://[a-zA-Z0-9./-]+', txt)
                if not urls:
                    # Original did [0] unconditionally and crashed with
                    # IndexError on entries without a recognizable URL.
                    continue
                href = urls[0]
                # str.replace, not re.sub: href is data, and its '.' and '/'
                # would be interpreted as regex metacharacters by re.sub.
                title = re.sub(r'\s', '', txt.replace(href, ''))
                lines.append(label + ' ' + title + ' ' + href + '\n')
    # join instead of repeated += (the original build was quadratic).
    return ''.join(lines)
def summary_process(a):
    """Scrape one category tag and append its entries to the output file.

    Args:
        a: A BeautifulSoup ``<a>`` tag whose ``aria-label`` looks like
           ``"NAME (1,234个项目)"`` and whose ``href`` is the category URL.

    Side effects:
        Appends the scraped lines to ``输出.txt`` and prints elapsed time.
    """
    start = time.perf_counter()
    # aria-label format: "NAME (COUNT个项目)" -> name, count, trailing ''.
    x, y, z = re.split(r'(?: \(|个项目\))', a.attrs['aria-label'])
    y = int(y.replace(',', ''))  # strip thousands separators; str.replace suffices
    res = fenleipa(x, y, a.attrs['href'])
    # Explicit UTF-8: without it the locale default (e.g. GBK on Chinese
    # Windows) can raise UnicodeEncodeError on scraped web text.
    with open('输出.txt', 'a', encoding='utf-8') as f:
        f.write(res)
    end = time.perf_counter()
    print('label:%s已完成,花费时间:%d'%(x, end-start))
if __name__ == '__main__':
    # NOTE: the original line read `get_soup('') //网址` — `//` is floor
    # division in Python, not a comment, so `网址` was an undefined name.
    soup = get_soup('')  # TODO: fill in the site's index-page URL (网址)
    # Collect every category-tag link from the tag cloud.
    summary = soup.find_all("a", attrs={'class': re.compile(r'tag-cloud-link tag-link-\d+ tag-link-position-\d+')})
    # multiprocessing.dummy.Pool = thread pool; fine for I/O-bound scraping.
    pool = Pool(3)
    pool.map(summary_process, summary)
    # Shut the pool down cleanly (the original never closed/joined it).
    pool.close()
    pool.join()