多线程版本的网页爬取

首先
多线程和多进程都是一样的格式,这个是python本身语言的封装优势
也就是说python中,多线程和多进程的调用方法几乎是一模一样的
就是把调用线程的函数换成了调用进程的函数

其次就是
调用线程和进程的函数库是python自带的原生库
不用自己去pip下载!!!!!
from concurrent.futures import ThreadPoolExecutor
直接导入即可

最后就是
线程和进程的调用很简单
就是创建一个线程池(进程池,此处以线程池来举例)

    with ThreadPoolExecutor(10) as x:
        for i in range(1,50):
            x.submit(down_1,html)

解释:
创建一个含有10个线程的线程池
循环提交任务(range(1,50) 即 i 从 1 到 49,共 49 次)
调用函数,传入该函数的参数

以下为我们学校举例的完整代码

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

def base_url(i, timeout=10):
    """Fetch listing page *i* of the school news channel and return it parsed.

    Args:
        i: 1-based page number; page 1 has a different URL pattern than
           the rest ("9260.html" vs "9260_<i>.html").
        timeout: seconds to wait for the HTTP response before giving up
           (prevents a worker from hanging forever on a dead connection).

    Returns:
        An lxml element tree of the page HTML.
    """
    if i == 1:
        url = 'https://www.qjnu.edu.cn/channels/9260.html'
    else:
        url = f'https://www.qjnu.edu.cn/channels/9260_{i}.html'
    request = requests.get(url, timeout=timeout)
    # The site serves UTF-8; set it explicitly so .text decodes correctly.
    request.encoding = 'utf-8'
    html = etree.HTML(request.text)
    return html

def down_1(html, keyword=None):
    """Find articles whose title contains a keyword and download their text.

    For every matching link on the listing page: append the article URL to a
    shared index file, fetch the article, and write its paragraph text to a
    file named after the article title.

    Args:
        html: parsed lxml tree of a listing page (from ``base_url``).
        keyword: title filter; defaults to the module-level ``key`` set in
            ``__main__`` (kept for backward compatibility with existing calls).
    """
    if keyword is None:
        keyword = key  # fall back to the global defined in __main__
    name_list = html.xpath('//div[@class="media"]/h4/a/text()')
    url_list = html.xpath('//div[@class="media"]/h4/a/@href')
    for name, link in zip(name_list, url_list):
        if keyword not in name:
            continue
        # Record every matching article URL in one shared index file.
        with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
            fp.write(link + '\n')
        # Timeout so a stalled download cannot hang a pool thread forever.
        request = requests.get(link, timeout=10)
        request.encoding = 'utf-8'
        html2 = etree.HTML(request.text)
        tex_list = html2.xpath('//div[@class="field-item even"]//p/span/text()')
        # Join the extracted fragments into readable text instead of
        # dumping the Python list repr (the original wrote str(tex_list)).
        with open(name + '.txt', 'w', encoding='UTF-8') as fp:
            fp.write('\n'.join(tex_list))

if __name__ == '__main__':
    page = int(input('请输入需要爬取的页数:'))
    key = str(input('请输入要查找的关键词:'))

    def crawl_page(i):
        # Do the fetch AND the processing inside the worker thread.
        # The original called base_url(i) in the main loop, so every
        # listing page was downloaded serially and the pool had almost
        # nothing to parallelise.
        down_1(base_url(i))

    with ThreadPoolExecutor(10) as pool:
        for i in range(1, page + 1):
            pool.submit(crawl_page, i)

修改版

import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

def base_url(i, url=None, timeout=10):
    """Fetch a page and return its parsed lxml tree.

    If *url* is empty (``None`` or ``[]`` — the original code used an empty
    list as the "no URL" sentinel, still accepted here for compatibility),
    the listing-page URL for page *i* is constructed; otherwise *url* is
    fetched directly (used for individual article pages).

    Args:
        i: 1-based listing-page number; ignored when *url* is given.
        url: explicit URL to fetch, or empty to build the listing URL.
        timeout: seconds before the HTTP request is abandoned.

    Returns:
        An lxml element tree of the page HTML.
    """
    if not url:
        if i == 1:
            url = 'https://www.qjnu.edu.cn/channels/9260.html'
        else:
            url = f'https://www.qjnu.edu.cn/channels/9260_{i}.html'
    request = requests.get(url, timeout=timeout)
    # The site serves UTF-8; set it explicitly so .text decodes correctly.
    request.encoding = 'utf-8'
    html = etree.HTML(request.text)
    return html

def down_1(html, keyword=None):
    """Find articles whose title contains a keyword and download their text.

    For every matching link on the listing page: append the article URL to a
    shared index file, fetch the article via ``base_url``, and write its
    paragraph text to a file named after the article title.

    Args:
        html: parsed lxml tree of a listing page (from ``base_url``).
        keyword: title filter; defaults to the module-level ``key`` set in
            ``__main__`` (kept for backward compatibility with existing calls).
    """
    if keyword is None:
        keyword = key  # fall back to the global defined in __main__
    name_list = html.xpath('//div[@class="media"]/h4/a/text()')
    url_list = html.xpath('//div[@class="media"]/h4/a/@href')
    for name, link in zip(name_list, url_list):
        if keyword not in name:
            continue
        # Record every matching article URL in one shared index file.
        with open('学校党员主题网址.txt', 'a', encoding='UTF-8') as fp:
            fp.write(link + '\n')
        # Reuse base_url for the fetch; an explicit URL bypasses the
        # listing-page URL construction.
        html2 = base_url(0, link)
        tex_list = html2.xpath('//div[@class="field-item even"]//p/span/text()')
        # Join the extracted fragments into readable text instead of
        # dumping the Python list repr (the original wrote str(tex_list)).
        with open(name + '.txt', 'w', encoding='UTF-8') as fp:
            fp.write('\n'.join(tex_list))

if __name__ == '__main__':
    page = int(input('请输入需要爬取的页数:'))
    key = str(input('请输入要查找的关键词:'))

    def crawl_page(i):
        # Do the fetch AND the processing inside the worker thread.
        # The original called base_url(i, url) in the main loop, so every
        # listing page was downloaded serially and the pool had almost
        # nothing to parallelise. [] is the "build the listing URL"
        # sentinel expected by base_url.
        down_1(base_url(i, []))

    with ThreadPoolExecutor(10) as pool:
        for i in range(1, page + 1):
            pool.submit(crawl_page, i)
posted @ 2022-06-14 22:13  皓_月  阅读(98)  评论(0)    收藏  举报