osnosn

  博客园 :: 首页 :: 博问 :: 闪存 :: 联系 :: 订阅 :: 管理

python3.6_多进程_multiprocessing.pool_concurrent.futures_ProcessPoolExecutor_对比

转载注明来源: 本文链接 来自osnosn的博客,写于 2020-06-27.

多进程的多种写法,在大量任务的情况下,效率的对比。

(后面有两个例程参考)

import time
from multiprocessing.pool import Pool
from concurrent.futures import as_completed, ProcessPoolExecutor

NUMBERS = range(1, 60000)  # the 59,999 task inputs shared by every benchmark variant
K = 50  # f() sums K+1 exponent terms per input (controls per-task CPU cost)

def f(x, k_terms=None):
    """CPU-bound toy task used by every benchmark variant.

    Computes sum of x ** (1 / k**1.5) for k = 1 .. k_terms.

    Args:
        x: numeric input value.
        k_terms: number of terms to sum; defaults to the module-level
            K + 1, which preserves the original behavior exactly.

    Returns:
        A two-element list ['xx', total] — a tag plus the computed sum
        (the tag makes the result visibly a structured payload).
    """
    n = K + 1 if k_terms is None else k_terms
    r = 0
    for k in range(1, n + 1):
        r += x ** (1 / k ** 1.5)
    return ['xx', r]

if __name__ == '__main__':
    # Each `if 1:` guard toggles one timed variant on/off (set to 0 to skip).
    # Every variant maps f() over all of NUMBERS and prints the elapsed time.
    if 1:
        # Baseline: plain sequential loop, no multiprocessing at all.
        print('-------------------\n no multiProcessing:')
        start = time.time()
        l = []
        for nn in NUMBERS:
            result=f(nn)
            l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Pool.map with 4 workers; map() picks a chunksize automatically.
        print('-------------------\n multiprocessing.pool.Pool:')
        start = time.time()
        l = []
        pool = Pool(4)
        # `num` is unused; zip() merely pairs inputs with results by position.
        for num, result in zip(NUMBERS, pool.map(f, NUMBERS)):
            l.append(result)
        pool.close()
        # terminate() right after close() is redundant here: map() has
        # already returned, so every result is in.
        pool.terminate()
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Pool.apply_async: one submission round-trip per task (high overhead).
        print('-------------------\n multiprocessing.pool.Pool, apply_async:')
        start = time.time()
        l = []
        pool = Pool(4)
        res=[]
        for nn in NUMBERS:
            res.append(pool.apply_async(f,(nn,)))
        pool.close()
        # Time spent just submitting the ~60k tasks.
        print('middle COST: {}'.format(time.time() - start))
        pool.join()
        for rr in res:
            l.append(rr.get())
        pool.terminate()
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Same as above, but each worker process is replaced after 1000 tasks.
        print('-------------------\n multiprocessing.pool.Pool, apply_async,maxtasksperchild=1000 :')
        start = time.time()
        l = []
        pool = Pool(4,maxtasksperchild=1000)
        res=[]
        for nn in NUMBERS:
            res.append(pool.apply_async(f,(nn,)))
        pool.close()
        print('middle COST: {}'.format(time.time() - start))
        pool.join()
        for rr in res:
            l.append(rr.get())
        pool.terminate()
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # executor.map with chunksize = len(NUMBERS) // (workers * 4).
        print('-------------------\n ProcessPoolExecutor with chunksize,1/4:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            # NOTE(review): _max_workers is a private attribute (it equals
            # the max_workers=4 passed above); a named constant would be safer.
            chunksize, extra = divmod(len(NUMBERS), executor._max_workers * 4)
            print('chunksize',chunksize)
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS, chunksize=chunksize)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Same, chunksize = len(NUMBERS) // (workers * 10).
        print('-------------------\n ProcessPoolExecutor with chunksize,1/10:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            chunksize, extra = divmod(len(NUMBERS), executor._max_workers * 10)
            print('chunksize',chunksize)
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS, chunksize=chunksize)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Same, chunksize = len(NUMBERS) // (workers * 100).
        print('-------------------\n ProcessPoolExecutor with chunksize,1/100:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            chunksize, extra = divmod(len(NUMBERS), executor._max_workers * 100)
            print('chunksize',chunksize)
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS, chunksize=chunksize)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Same, chunksize = len(NUMBERS) // (workers * 300).
        print('-------------------\n ProcessPoolExecutor with chunksize,1/300:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            chunksize, extra = divmod(len(NUMBERS), executor._max_workers * 300)
            print('chunksize',chunksize)
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS, chunksize=chunksize)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # Fixed chunksize of 500.
        print('-------------------\n ProcessPoolExecutor with chunksize,500:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            chunksize=500
            print('chunksize',chunksize)
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS, chunksize=chunksize)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # executor.submit: one Future per task. Results are gathered with
        # as_completed(), so they arrive in COMPLETION order, not submission
        # order — which is why l[0] differs from the other variants in the
        # recorded output below.
        print('-------------------\n ProcessPoolExecutor submit:')
        start = time.time()
        pool_res=[]
        executor=ProcessPoolExecutor(max_workers=4)
        for nn in NUMBERS:
            res=executor.submit(f,nn)
            pool_res.append(res)
        print('middle COST: {}'.format(time.time() - start))
        l = []
        for p_res in as_completed(pool_res):
            result=p_res.result()
            l.append(result)
        executor.shutdown()
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))
    if 1:
        # executor.map with the default chunksize of 1: one IPC round-trip
        # per item, hence the slowest pooled variant.
        print('-------------------\n ProcessPoolExecutor without chunksize:')
        start = time.time()
        l = []
        with ProcessPoolExecutor(max_workers=4) as executor:
            for num, result in zip(NUMBERS, executor.map(f, NUMBERS)):
                l.append(result)
        print(len(l), l[0])
        print('COST: {}'.format(time.time() - start))

    print('')

结果:

-------------------
 no multiProcessing:
59999 ['xx', 51.0]
COST: 1.2773692607879639
-------------------
 multiprocessing.pool.Pool:
59999 ['xx', 51.0]
COST: 0.4585001468658447
-------------------
 multiprocessing.pool.Pool, apply_async:
middle COST: 1.492830514907837
59999 ['xx', 51.0]
COST: 4.116384267807007
-------------------
 multiprocessing.pool.Pool, apply_async,maxtasksperchild=1000 :
middle COST: 2.0289459228515625
59999 ['xx', 51.0]
COST: 5.032078266143799
-------------------
 ProcessPoolExecutor with chunksize,1/4:
chunksize 3749
59999 ['xx', 51.0]
COST: 0.4767882823944092
-------------------
 ProcessPoolExecutor with chunksize,1/10:
chunksize 1499
59999 ['xx', 51.0]
COST: 0.5644888877868652
-------------------
 ProcessPoolExecutor with chunksize,1/100:
chunksize 149
59999 ['xx', 51.0]
COST: 0.4668114185333252
-------------------
 ProcessPoolExecutor with chunksize,1/300:
chunksize 49
59999 ['xx', 51.0]
COST: 0.673607587814331
-------------------
 ProcessPoolExecutor with chunksize,500:
chunksize 500
59999 ['xx', 51.0]
COST: 0.45476365089416504
-------------------
 ProcessPoolExecutor submit:
middle COST: 11.38172698020935
59999 ['xx', 16145.294670113708]
COST: 21.179430723190308
-------------------
 ProcessPoolExecutor without chunksize:
59999 ['xx', 51.0]
COST: 20.61406421661377

各种写法的耗时区别还挺大。测试机器 CPU 有 6 核,Python 3.6。

参考:
使用Python进行并发编程-PoolExecutor篇
concurrent.futures


感觉用 pool.map() 并设置 chunksize 比较好

例子01

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
多进程例程,
队列一次全部准备好,一次启动子进程。
"""

import os
from concurrent.futures import ProcessPoolExecutor as Pool

def child_do(ii, fnn):
    # The actual worker, executed inside a child process; it may accept any
    # number of arguments. Opening HDF5 files and calling print() both work
    # from here.
    print(ii, 'child')
    # The return value may carry several results (a dataframe is fine too).
    value = ii + 2 + fnn
    res2 = res3 = res4 = value
    return [ii, res2, res3, res4]  # four results in one list

def child_one(param):
    # Pool.map delivers exactly one argument per task, so unpack the tuple
    # here and forward its elements to the real worker.
    return child_do(*param)

def do_main():
    """Build the full argument list, run the pool over it once, collect results."""
    # do some preparation here...
    cpucount = os.cpu_count()  # default: one worker per CPU
    if cpucount < 3: cpucount = 3
    cpucount = 4  # force the worker count (the two lines above end up unused)

    pool = Pool(cpucount)  # open the pool; children are started via pool.map() below
    pool_args = []  # argument queue handed to the children

    ii = 0  # task counter
    for fnn in range(0, 1115):
        ii += 1
        # A child accepts only one argument, so several values are packed into
        # a tuple (a list or dict would also work).
        # A dataframe may be included (it is passed by reference), but beware:
        # several processes mutating the same dataframe may misbehave.
        pool_args.append((ii, fnn,))  # child_do() takes two arguments

    # Arguments are shipped from the parent to the children; sending them one
    # at a time is slow, so split the queue into ~30 transfers.
    # Fix: use the local worker count instead of the private pool._max_workers
    # attribute (same value — the pool was created with cpucount workers).
    chunksize, _ = divmod(len(pool_args), cpucount * 30)
    if chunksize < 50:
        chunksize = 50   # too small: clamp up to 50
    elif chunksize > 200:
        chunksize = 200  # too large: clamp down to 200

    # Start the children and wait for the results.
    for p_result in pool.map(child_one, pool_args, chunksize=chunksize):
        jj, res2, res3, res4 = p_result  # child_do() returned four results
        print(jj, res2, res3, res4)
        # aggregate the results here

    print('启动进程池', ii)
    pool.shutdown()  # close the pool

    # process the aggregated results here

# Entry point: run only when executed as a script, not on import.
if __name__ == '__main__':
    do_main()

例子02

#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
多进程例程
队列准备多次,多次启动子进程。
"""

import os
from concurrent.futures import ProcessPoolExecutor as Pool
import time

def child_do(ii, fnn):
    """Real worker body, executed inside a child process; takes several args."""
    print(ii, 'child')  # printing from a child process works fine
    # The returned values may be dataframes; several results are allowed.
    total = fnn + ii + 2
    return [ii, total, total, total]  # the counter plus three identical results

def child_one(param):
    """Single-argument shim: unpack *param* and hand it to the real worker."""
    outcome = child_do(*param)
    return outcome

# Start the pool over one batch and handle the results
# (this function itself always runs in the parent, never concurrently).
def do_result(pool, pool_args ):
    """Map child_one over pool_args via the pool and aggregate the results.

    Returns the `res2` field of the LAST completed task, or None when
    pool_args is empty. (Bug fix: the original left res2 undefined and
    raised NameError whenever it was called with an empty queue — e.g.
    when the total task count is an exact multiple of the batch size.)
    """
    # Arguments travel from the parent to the children; shipping one at a
    # time is slow, so send them in chunks.
    chunksize=20  # in real use, 50 or 100 works well
    res2 = None   # defined even when pool_args is empty
    # Start the children and wait for their results.
    for p_result in pool.map(child_one, pool_args, chunksize=chunksize):
        jj,res2,res3,res4=p_result  # unpack the four results from child_do()
        print(jj,res2,res3,res4)
        # aggregate the results here
    return res2  # return the aggregate

def do_main():
    """Feed tasks to the pool in batches of 200 instead of queueing all at once."""
    # do some preparation first

    cpucount=os.cpu_count() # worker count = number of CPUs
    if cpucount<3: cpucount=3
    cpucount=4 # force the worker count (the two lines above end up unused)

    pool=Pool(cpucount) # open the pool; children are started via pool.map() below
    pool_args=[] # argument queue handed to the children

    ii=0 # task counter
    for fnn in range(0,1115):
        ii+=1
        # A child accepts only one argument, so several values are packed
        # into a tuple (a list or dict would also work).
        # A dataframe may be included (passed by reference), but beware of
        # several processes mutating the same dataframe concurrently.
        pool_args.append((ii,fnn,) ) # child_do() takes two arguments

        if (ii % 200) == 0: # in real use, flush e.g. every 1000 tasks
            mysum=do_result(pool, pool_args)
            pool_args=[] # reset the argument queue for the next batch
            print('启动进程池',ii)
            time.sleep(0.5) # test-only pause so the output is easy to follow

    # Loop done: run the pool once more to drain whatever is left in the queue.
    # NOTE(review): if the total count were an exact multiple of 200, this
    # call would receive an empty pool_args — confirm do_result handles that.
    mysum=do_result(pool, pool_args)
    print('启动进程池',ii)
    pool.shutdown() # close the pool

    # process the aggregated results here

# Entry point: run only when executed as a script, not on import.
if __name__ == '__main__':
    do_main()

转载注明来源: 本文链接 来自osnosn的博客.

posted on 2020-06-27 20:11  osnosn  阅读(1009)  评论(0编辑  收藏  举报