Thread pool crawl of Pearvideo — a simple example:

# Crawl video data from Pearvideo
import requests
import re
from lxml import etree
from multiprocessing.dummy import Pool
import random

def getVideoData(url):
    # Download the raw video bytes (headers is defined at module level below)
    return requests.get(url=url,headers=headers).content

def saveVideo(data):
    # Write the video to disk under a random file name
    fileName = str(random.randint(0,5000))+'.mp4'
    with open(fileName,'wb') as fp:
        fp.write(data)
        print('%s download success \n' % fileName)


# Instantiate a thread pool with 5 workers (multiprocessing.dummy.Pool is thread-based)
pool = Pool(5)
url = 'https://www.pearvideo.com/category_1'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
}
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//div[@id="listvideoList"]/ul/li')

# Collect the real .mp4 URL from each video's detail page
video_url_list = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    detail_page = requests.get(url=detail_url,headers=headers).text
    video_url = re.findall('srcUrl="(.*?)",vdoUrl',detail_page,re.S)[0]
    video_url_list.append(video_url)

# Download all videos concurrently, then save them via the same pool
video_data_list = pool.map(getVideoData,video_url_list)

pool.map(saveVideo,video_data_list)
Thread pool: crawling Pearvideo data (simple example)
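The same download-and-save step could also be driven by concurrent.futures, which the rest of this post covers. A minimal sketch, assuming the getVideoData and saveVideo functions and the video_url_list built in the example above:

from concurrent.futures import ThreadPoolExecutor

# Minimal sketch: replace multiprocessing.dummy.Pool with a
# concurrent.futures thread pool for the download/save steps.
with ThreadPoolExecutor(max_workers=5) as executor:
    for data in executor.map(getVideoData, video_url_list):
        saveVideo(data)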

 

 

Process pools and thread pools: concurrent.futures

1. The concurrent.futures module is for creating parallel tasks; it provides a higher-level interface for executing calls asynchronously.
2. The module is very convenient to use, and its interface is simple and well encapsulated.
3. concurrent.futures can implement both process pools and thread pools.
4. Importing the two pool types (a construction sketch follows this list):
          from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
          p = ProcessPoolExecutor(max_workers)  # if max_workers is omitted, a process pool defaults to the number of CPUs
          p = ThreadPoolExecutor(max_workers)   # if max_workers is omitted, a thread pool defaults to CPUs * 5 (min(32, CPUs + 4) on Python 3.8+)
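A minimal construction sketch for both pool types; the printed default worker counts depend on the machine and the Python version, and _max_workers is a private attribute peeked at only for illustration:

import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

print(os.cpu_count())                  # number of CPUs on this machine

p = ProcessPoolExecutor()              # defaults to the number of CPUs
t = ThreadPoolExecutor()               # CPUs * 5 on older Pythons, min(32, CPUs + 4) on 3.8+
print(p._max_workers, t._max_workers)  # private attribute, shown only to inspect the defaults

p.shutdown()
t.shutdown()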
1. submit(fn, *args, **kwargs)
Submit a task asynchronously.

2. map(func, *iterables, timeout=None, chunksize=1)
Replaces a for loop of submit calls.

3. shutdown(wait=True)
Equivalent to pool.close() + pool.join() on a multiprocessing pool.
wait=True: block until every task in the pool has finished and its resources have been reclaimed, then continue.
wait=False: return immediately without waiting for the pool's tasks to finish.
Whatever wait is set to, the program as a whole still waits for all tasks to finish before exiting.
submit and map must be called before shutdown (see the sketch after this list).

4. result(timeout=None)
Get the task's return value, blocking for up to timeout seconds.

5. add_done_callback(fn)
Register a callback to run when the future completes.
Basic methods
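A minimal sketch of submit, result and shutdown together (the with statement is equivalent to calling shutdown(wait=True) on exit; the work function is illustrative):

from concurrent.futures import ThreadPoolExecutor
import time

def work(n):
    time.sleep(1)
    return n * n

# The with-block calls shutdown(wait=True) automatically on exit.
with ThreadPoolExecutor(max_workers=2) as executor:
    future = executor.submit(work, 3)   # submit one task asynchronously
    print(future.result(timeout=5))     # block at most 5 s for the result -> 9

executor2 = ThreadPoolExecutor(max_workers=2)
futures = [executor2.submit(work, i) for i in range(4)]
executor2.shutdown(wait=False)          # returns immediately; the submitted tasks keep running
print([f.result() for f in futures])    # result() still waits for each task -> [0, 1, 4, 9]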
from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
from threading import currentThread
import os,time,random
 
 
def task(n):
    print("%s is running " % os.getpid())
    time.sleep(random.randint(1,3))
    return n*2
 
if __name__ == '__main__':
    start = time.time()
    executor = ProcessPoolExecutor(4)
 
    res = []
    for i in range(10):  # launch 10 tasks
        future = executor.submit(task,i)  # submit the task asynchronously
        res.append(future)

    executor.shutdown()  # wait until every process in the pool has finished
    print("++++>")
    for r in res:
        print(r.result())  # print each result
 
    end = time.time()
    print(end - start)
 
--------------------- output
2464 is running
9356 is running
10780 is running
9180 is running
2464 is running
10780 is running
9180 is running
9356 is running
10780 is running
9180 is running
++++>
0
2
4
6
8
10
12
14
16
18
6.643380165100098
Process pool
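With shutdown(), results are only read after every task has finished, and always in submission order. If results should be handled as soon as each task completes, concurrent.futures.as_completed can replace that pattern; a minimal sketch reusing the same task:

from concurrent.futures import ProcessPoolExecutor, as_completed
import os, time, random

def task(n):
    time.sleep(random.randint(1, 3))
    return n * 2

if __name__ == '__main__':
    with ProcessPoolExecutor(4) as executor:
        futures = [executor.submit(task, i) for i in range(10)]
        # as_completed yields each future as soon as it finishes,
        # so fast tasks are printed before slow ones.
        for f in as_completed(futures):
            print(f.result())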
from concurrent.futures import ProcessPoolExecutor,ThreadPoolExecutor
from threading import currentThread
import os,time,random
 
 
def task(n):
    print("%s is running " % currentThread().getName())
    time.sleep(random.randint(1,3))
    return n*2
 
if __name__ == '__main__':
    start = time.time()
    executor = ThreadPoolExecutor(4)  # thread pool

    res = []
    for i in range(10):  # launch 10 tasks
        future = executor.submit(task,i)  # submit the task asynchronously
        res.append(future)

    executor.shutdown()  # wait until every thread in the pool has finished
    print("++++>")
    for r in res:
        print(r.result())  # print each result
 
    end = time.time()
    print(end - start)
 
------------ output
 
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_0 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_1 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_2 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_3 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_3 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_1 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_0 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_2 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_3 is running
<concurrent.futures.thread.ThreadPoolExecutor object at 0x00000000025B0DA0>_1 is running
++++>
0
2
4
6
8
10
12
14
16
18
5.002286195755005
Thread pool
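The long worker names in the output above are the default repr-based names. On Python 3.6+ the thread_name_prefix argument gives the workers readable names; a small sketch:

from concurrent.futures import ThreadPoolExecutor
from threading import current_thread

# thread_name_prefix (Python 3.6+) names the worker threads worker_0, worker_1, ...
executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix='worker')
executor.submit(lambda: print(current_thread().name))  # e.g. worker_0
executor.shutdown()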
import requests
import time
from concurrent.futures import ThreadPoolExecutor
 
def get(url):
    print('GET {}'.format(url))
    response = requests.get(url)
    time.sleep(2)
    if response.status_code == 200:  # status code 200 means the download succeeded
        return {'url': url, 'content': response.text}
 
def parse(res):
    print('%s parse res is %s' % (res['url'], len(res['content'])))
    return '%s parse res is %s' % (res['url'], len(res['content']))
 
def save(res):
    print('save', res)
 
def task(res):
    res = res.result()
    par_res = parse(res)
    save(par_res)
 
 
if __name__ == '__main__':
    urls = [
            'http://www.cnblogs.com/linhaifeng',
            'https://www.python.org',
            'https://www.openstack.org',
        ]
 
    pool = ThreadPoolExecutor(2)
    for i in urls:
        pool.submit(get, i).add_done_callback(task)  # the callback receives a Future object,
        # so call res.result() first to get the actual return value;
        # whichever task finishes first has its callback run first.
        # Callbacks are a general pattern: they work with process pools as well as thread pools.
    pool.shutdown()  # equivalent to close() + join() on a multiprocessing pool
 
------------- output
GET http://www.cnblogs.com/linhaifeng
GET https://www.python.org
http://www.cnblogs.com/linhaifeng parse res is 17426
save http://www.cnblogs.com/linhaifeng parse res is 17426
GET https://www.openstack.org
https://www.python.org parse res is 48809
save https://www.python.org parse res is 48809
https://www.openstack.org parse res is 60632
save https://www.openstack.org parse res is 60632
Callback function
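As the comment notes, the same add_done_callback pattern works with a process pool as well. A minimal sketch with the error handling that real downloads usually need (get_len and on_done are illustrative names, not part of the example above):

from concurrent.futures import ProcessPoolExecutor

def get_len(url):
    # placeholder for the real download step
    return len(url)

def on_done(future):
    # the callback always receives the Future; result() re-raises any
    # exception the task raised, so wrap it if the task can fail
    try:
        print('done:', future.result())
    except Exception as exc:
        print('task failed:', exc)

if __name__ == '__main__':
    with ProcessPoolExecutor(2) as pool:
        for u in ['https://www.python.org', 'https://www.openstack.org']:
            pool.submit(get_len, u).add_done_callback(on_done)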
import requests
import time
from concurrent.futures import ThreadPoolExecutor
 
def get(url):
    print('GET {}'.format(url))
    response = requests.get(url)
    time.sleep(2)
    if response.status_code == 200:  # status code 200 means the download succeeded
        return {'url': url, 'content_len': len(response.text)}
 
 
 
if __name__ == '__main__':
    urls = [
            'http://www.cnblogs.com/linhaifeng',
            'https://www.python.org',
            'https://www.openstack.org',
        ]
 
    pool = ThreadPoolExecutor(2)
    res = pool.map(get, urls)  # map replaces the for loop + submit

    pool.shutdown()   # equivalent to close() + join() on a multiprocessing pool
    print('=' * 30)
    for r in res:  # map returns an iterator of results
        print(r)
 
GET http://www.cnblogs.com/linhaifeng
GET https://www.python.org
GET https://www.openstack.org
{'url': 'http://www.cnblogs.com/linhaifeng', 'content_len': 17426}
{'url': 'https://www.python.org', 'content_len': 48809}
{'url': 'https://www.openstack.org', 'content_len': 60632}
map
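Two properties of map worth keeping in mind: results come back in the order of the input iterable (not completion order), and an exception raised inside a task only surfaces when the iterator reaches that element. A minimal sketch (half is an illustrative function):

from concurrent.futures import ThreadPoolExecutor

def half(n):
    return 10 // n                             # raises ZeroDivisionError when n == 0

with ThreadPoolExecutor(2) as pool:
    results = pool.map(half, [5, 2, 0, 1])     # results are yielded in input order
    try:
        for r in results:
            print(r)                           # prints 2, then 5 ...
    except ZeroDivisionError:
        print('the error surfaces here, while iterating')  # ... then the third task's error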
from threading import Thread, currentThread
import time
import queue
 
 
class MyThread(Thread):
 
    def __init__(self, queue):
        super().__init__()
        self.queue = queue
        self.daemon = True  # daemon threads exit together with the main thread
        self.start()

    def run(self):
        """
        1. Keep the worker running forever,
        2. fetch tasks from the queue,
        3. then call the task's function (get the task, then execute it).
        :return:
        """
        while True:
            func, args, kwargs = self.queue.get()  # get a task from the queue
            func(*args, **kwargs)
            self.queue.task_done()  # decrement the queue's unfinished-task counter once this task is done
 
 
class MyPool(object):
    """
    Create the worker threads ahead of time, before any task arrives, and let them wait for work.
    """

    def __init__(self, num):  # number of worker threads
        self.num = num
        self.queue = queue.Queue()
        for i in range(self.num):
            MyThread(self.queue)

    def submit(self, func, args=(), kwargs=None):
        self.queue.put((func, args, kwargs or {}))  # avoid a shared mutable default for kwargs

    def join(self):
        self.queue.join()  # wait until every task in the queue has been processed
 
 
def task(i):
    print(currentThread().getName(), i)
    time.sleep(2)
 
 
if __name__ == '__main__':
    start = time.time()
    pool = MyPool(3)  # instantiate a thread pool with 3 worker threads
    for i in range(4):
        pool.submit(task, args=(i,))
    pool.join()
    print('Elapsed: {} seconds'.format(time.time() - start))
Custom thread pool
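One subtlety in the run() loop above: if a submitted function raises, the worker thread dies before task_done() is called and pool.join() then blocks forever. A hedged sketch of a safer loop (SafeThread is an illustrative subclass of the MyThread class above, not part of the original code):

class SafeThread(MyThread):
    """Illustrative variant: keep the worker alive and the queue's
    unfinished-task counter consistent even when a task raises."""

    def run(self):
        while True:
            func, args, kwargs = self.queue.get()
            try:
                func(*args, **kwargs)
            except Exception as exc:
                print('task failed:', exc)  # report the failure instead of killing the worker
            finally:
                self.queue.task_done()      # always decrement the counter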

 

 
