python并发

1.并发

引入并发是为了提升程序运行速度。程序常见的运行方式包括：单线程串行、多线程并发、多 CPU 并行、多机器并行。

线程是 CPU 调度的基本单位。单核 CPU 同一时刻通常只能运行一个线程；多核 CPU 同一时刻可以运行多个线程。

单核 CPU 通过快速切换执行对象，可以实现多进程、多线程、多协程的并发效果。但这不叫并行，多核cpu才可以并行。

并发包括：

多进程：multiprocessing，开多个进程，每个进程互相独立，可以分配到不同 CPU 核心上运行，适合 CPU 密集型任务，能实现真正的并行。
多线程：threading，在一个进程里开多个线程，一个线程等待 I/O 时，其他线程可以继续执行，所以适合 I/O 密集型任务。
异步IO（多协程）：asyncio，在单线程里，通过 async/await 在任务等待 I/O 时主动切换去执行别的任务，也适合 I/O 密集型任务。

常用工具包括：

使用Lock锁对公共资源加锁，防止访问冲突
使用Queue实现不同进程/线程之间的通信，实现生产者-消费者模式
使用进程池Pool/线程池Pool，简化进程/线程的任务提交，等待结束，获取结果
使用subprocess启动外部程序的进程，并进行输入输出的交互

CPU密集型计算：也叫计算密集型，是指IO在很短的时间内就能完成，CPU需要大量的计算和处理，特点是CPU占用率很高。例如压缩解压缩，加解密，正则表达式搜索。

IO密集型计算：是指大部分时间CPU在等待IO的读写操作，CPU的占有率较低。例如文件处理，爬虫，读写数据库程序。

一个进程可以包含多个线程，一个线程可以包含多个协程。

多进程，多线程，多协程的对比：

多进程（process）：优点：可以利用多核cpu并行执行；缺点：占用资源最多，可启动数目比线程少；适用于cpu密集型计算。
多线程（thread）：优点：相比进程更轻量，占用资源更少；缺点：相比进程，多线程只能并发执行，不能利用多cpu（GIL），相比协程，启动数目有限制，占用内存资源，有线程切换开销；适用于：IO密集型计算，同时运行的任务数目要求不多。
多协程（Coroutine）：优点：内存开销最小，启动协程数量最多；缺点：支持的库有限制（aiohttp vs requests），代码实现复杂；适用于：IO密集型计算，需要超多任务运行，但有现成库支持的场景。

补充：在 GIL 下，多线程不能很好地并行执行 Python 字节码，不是因为多个 CPU 不能同时处理同一进程里的多个线程，而是因为这些线程要竞争同一个 GIL，所以在 CPU 密集型场景下，多线程通常只有并发效果，而没有很好的并行效果。

全局解释器GIL

python执行速度慢的两个原因：①动态类型语言，边解释边执行；②由于GIL，多线程无法利用多核cpu并行执行
全局解释器锁（GIL，Global Interpreter Lock）是 CPython 解释器用于同步线程的一种机制，它使得同一进程内任何时刻只有一个线程在执行 Python 字节码；即使在多核处理器上，带 GIL 的解释器同一时间也只允许执行一个线程

不加GIL导致的问题：

CPython 使用引用计数管理内存。如果没有 GIL，多个线程可能同时修改同一个对象的引用计数：例如线程A和线程B都把obj的引用计数减一，并且都认为它已经减到0，线程B释放了obj之后，如果A再释放一次，就会破坏其他内存
规避GIL问题:

IO密集型计算可以采用多线程（线程在等待IO时会释放GIL，其他线程可以继续执行）；CPU密集型计算可以考虑多进程（在多核cpu前提下）

多线程爬取

#blog_spider.py

import requests

# Listing pages to crawl (pages 1..50).
# The page number must be part of the URL path: a "#p{page}" fragment is
# never sent to the server, so every request would return the same home page.
urls = [
    "https://www.cnblogs.com/" if page == 1
    else f"https://www.cnblogs.com/sitehome/p/{page}"
    for page in range(1, 50 + 1)
]

def craw(url, timeout=10):
    """Fetch ``url`` and print the URL together with the length of the response text.

    Args:
        url: Address to download with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread forever.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    r = requests.get(url, timeout=timeout)
    print(url, len(r.text))

# Run the demo fetch only when this file is executed directly, so that
# `import blog_spider` from another script does not trigger a network request.
if __name__ == "__main__":
    craw(urls[0])

#01.multi_thread_craw.py

import blog_spider
import threading
import time

# Single-threaded crawl
def single_thread():
    """Fetch every URL in ``blog_spider.urls`` one after another on the calling thread."""
    print("single_thread begin")
    for page_url in blog_spider.urls:
        blog_spider.craw(page_url)
    print("single_thread end")

# Multi-threaded crawl
def multi_thread():
    """Fetch every URL in ``blog_spider.urls`` concurrently, one thread per URL."""
    print("multi_thread begin")
    # Build one (not yet started) thread per URL.
    workers = [
        threading.Thread(target=blog_spider.craw, args=(page_url,))
        for page_url in blog_spider.urls
    ]
    # Start them all so the downloads overlap.
    for worker in workers:
        worker.start()
    # Block until every download has finished.
    for worker in workers:
        worker.join()

    print("multi_thread end")

if __name__ == "__main__":
    # Time the sequential crawl first, then the threaded one, and print the
    # wall-clock duration of each.
    for label, runner in (
        ("single thread cost:", single_thread),
        ("multi thread cost:", multi_thread),
    ):
        start = time.time()
        runner()
        end = time.time()
        print(label, end - start, "s")

生产者-消费者模型

#blog_spider.py

import requests
from bs4 import BeautifulSoup

# Listing pages 1..50: page 1 is the site home page, later pages use the
# /sitehome/p/<page> path.
urls = ["https://www.cnblogs.com/"] + [
    f"https://www.cnblogs.com/sitehome/p/{page}" for page in range(2, 51)
]

# Producer step: download a page
def craw(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to download with an HTTP GET request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread forever.

    Returns:
        The decoded response text.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    r = requests.get(url, timeout=timeout)
    return r.text

def parse(html):
    """Extract ``(href, title)`` pairs from a listing page.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of ``(href, text)`` tuples, one for each ``<a>`` element whose
        class is ``post-item-title``, in document order.
    """
    # Parse the markup with the "html.parser" backend.
    document = BeautifulSoup(html, "html.parser")
    entries = []
    # `class_` is the keyword BeautifulSoup uses to filter on the CSS class.
    for anchor in document.find_all("a", class_="post-item-title"):
        entries.append((anchor["href"], anchor.get_text()))
    return entries

if __name__ == "__main__":
    # Manual check: download the fourth listing page and print each parsed entry.
    fourth_page_html = craw(urls[3])
    for entry in parse(fourth_page_html):
        print(entry)

#02.producer_consumer_spider.py

import queue
import blog_spider
import time
import random
import threading

# Sentinel put on a queue to tell exactly one worker thread to stop.
_STOP = object()


# Producer
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    """Download URLs from ``url_queue`` and put each page's HTML on ``html_queue``.

    Runs until it takes the stop sentinel from ``url_queue``; without a stop
    condition the thread would block on ``get()`` forever once the queue is
    empty and the process could never exit.

    Args:
        url_queue: Queue of URLs to download, terminated by one sentinel per
            crawler thread.
        html_queue: Queue that receives the downloaded HTML text.
    """
    while True:
        # Take the next URL (blocks until one is available).
        url = url_queue.get()
        if url is _STOP:
            break
        # Download the page and hand it to the parsers.
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw{url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))


# Consumer
def do_parse(html_queue: queue.Queue, fout):
    """Parse pages taken from ``html_queue`` and write one result per line to ``fout``.

    Runs until it takes the stop sentinel from ``html_queue``.

    Args:
        html_queue: Queue of HTML text, terminated by one sentinel per parser
            thread.
        fout: Writable text file object shared by the parser threads.
    """
    while True:
        # Take the next page (blocks until one is available).
        html = html_queue.get()
        if html is _STOP:
            break
        # Extract the (url, title) pairs from the page.
        results = blog_spider.parse(html)
        # Write the whole page's results in a single call so that lines from
        # different parser threads are less likely to interleave.
        fout.write("".join(str(result) + "\n" for result in results))
        print(threading.current_thread().name, "results.size",
              len(results), "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))


if __name__ == "__main__":
    url_queue = queue.Queue()
    html_queue = queue.Queue()

    # Load every URL into url_queue.
    for url in blog_spider.urls:
        url_queue.put(url)

    # Crawler threads run do_craw concurrently.
    craw_threads = [
        threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        for idx in range(3)
    ]
    # One sentinel per crawler, queued after all URLs, so each crawler stops
    # once the real work is exhausted.
    for _ in craw_threads:
        url_queue.put(_STOP)
    for t in craw_threads:
        t.start()

    # Parser threads run do_parse concurrently; the `with` block guarantees
    # the output file is flushed and closed once they are done.
    with open("data.txt", "w", encoding="utf-8") as fout:
        parse_threads = [
            threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
            for idx in range(3)
        ]
        for t in parse_threads:
            t.start()

        # When every crawler has finished, no more HTML will arrive, so the
        # parsers can be told to stop after draining what is already queued.
        for t in craw_threads:
            t.join()
        for _ in parse_threads:
            html_queue.put(_STOP)
        for t in parse_threads:
            t.join()

线程安全问题及解决方法

线程安全指某个函数、函数库在多线程环境中被调用时，能够正确地处理多个线程之间的共享变量，使程序功能正常完成。
线程的执行随时可能被切换，如果对共享变量的访问没有做同步，就可能造成不可预料的结果，这就是线程不安全。

#03.lock_concurrent.py

import threading
import time

# Module-level lock that serialises the check-then-debit sequence in draw().
lock = threading.Lock()


class Account:
    """Bank account that stores a single mutable ``balance``."""

    def __init__(self, balance):
        self.balance = balance


def draw(account, amount):
    """Withdraw ``amount`` from ``account`` if the balance covers it.

    The balance check and the debit happen while holding ``lock`` so another
    thread cannot change the balance between the two steps. The outcome is
    printed, prefixed with the current thread's name.
    """
    worker = threading.current_thread().name
    with lock:
        if account.balance >= amount:
            print(worker, "取钱成功")
            account.balance = account.balance - amount
            print(worker, "余额", account.balance)
        else:
            print(worker, "取钱失败，余额不足")

if __name__ == "__main__":
    # One shared account with an opening balance of 1000.
    account = Account(1000)
    # Two threads, "ta" and "tb", try to withdraw 800 and 600 from it.
    withdrawals = (("ta", 800), ("tb", 600))
    workers = [
        threading.Thread(name=label, target=draw, args=(account, amount))
        for label, amount in withdrawals
    ]

    for worker in workers:
        worker.start()

加锁后，临界区内的代码同一时刻只能由一个线程执行，这部分代码的运行速度与单线程就没什么区别了

线程池

线程池原理

使用线程池好处

两种线程池使用方法

#04.thread_pool.py

import concurrent.futures
import blog_spider

# Crawl stage: a thread pool that lives for the duration of the `with` block.
with concurrent.futures.ThreadPoolExecutor() as crawl_pool:
    # Hand every URL in blog_spider.urls to blog_spider.craw.
    # Although the downloads run concurrently, map() yields results in the
    # same order as the input URLs, not in completion order.
    pages = crawl_pool.map(blog_spider.craw, blog_spider.urls)
    # Pair each URL with its HTML.
    htmls = list(zip(blog_spider.urls, pages))
    for url, html in htmls:
        print(url, len(html))

print("craw over")

# Parse stage: a second thread pool.
with concurrent.futures.ThreadPoolExecutor() as pool:
    # Submit every page to blog_spider.parse and remember which URL each
    # future belongs to; future.result() will hold the value of parse(html).
    futures = {
        pool.submit(blog_spider.parse, html): url
        for url, html in htmls
    }

    # Output style 1: iterate futures.items() to see results in submission order.
    for future, url in futures.items():
        print(url, future.result())

    # Output style 2: as_completed() yields futures as they finish, so
    # whichever parse completes first is printed first.
    for future in concurrent.futures.as_completed(futures):
        print(futures[future], future.result())

Web服务中线程池加速（IO密集型）

web服务架构和特点

使用线程池加速的好处

#05.flask_thread_pool.py

import flask
import json
import time
from concurrent.futures import ThreadPoolExecutor

# Flask application object; the route in this script is registered on it.
app=flask.Flask(__name__)
# Module-level thread pool shared by all requests, created with the default worker count.
pool=ThreadPoolExecutor()

def read_file():
    """Simulate a file read that takes 0.1 seconds and return its result string."""
    simulated_latency = 0.1
    time.sleep(simulated_latency)
    return "file result"

def read_db():
    """Simulate a database query that takes 0.2 seconds and return its result string."""
    simulated_latency = 0.2
    time.sleep(simulated_latency)
    return "db result"

def read_api():
    """Simulate a remote API call that takes 0.3 seconds and return its result string."""
    simulated_latency = 0.3
    time.sleep(simulated_latency)
    return "api result"

@app.route("/")
def index():
    """Run the three simulated reads concurrently and return their results as JSON text.

    All three tasks are submitted to the shared pool before any result is
    awaited, so their waits overlap instead of adding up.
    """
    pending = {
        "result_file": pool.submit(read_file),
        "result_db": pool.submit(read_db),
        "result_api": pool.submit(read_api),
    }
    # result() blocks until the corresponding task has finished.
    payload = {key: future.result() for key, future in pending.items()}
    return json.dumps(payload)

if __name__=="__main__":
    # Start the Flask server with its default settings when run as a script.
    app.run()

多进程加速程序运行

如果遇到cpu密集型，使用多线程反而会降低速度，因为增加了一些额外开销，而且多线程并未起到明显作用

图中上方是IO密集型，下面是CPU密集型，上面即使有GIL，也能很好的提高效率，但是下面有GIL的情况下，反而会降低速度

#06.thread_process_cpu_bound.py
import math
import time
from concurrent.futures import ThreadPoolExecutor,ProcessPoolExecutor

# Workload for the benchmark: the same large number repeated 100 times.
PRIMES = [112272535095293 for _ in range(100)]

def is_prime(n):
    """Return True if the integer ``n`` is prime, using trial division.

    Args:
        n: Integer to test. Values below 2 are never prime.

    Returns:
        True when ``n`` has no divisor other than 1 and itself, else False.
    """
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # math.isqrt works on integers of any size and is exact, whereas
    # math.sqrt converts to float first and raises OverflowError for very
    # large values.
    sqrt_n = math.isqrt(n)
    # Only odd candidates up to the integer square root need to be tried.
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True

# Single thread
def single_thread():
    """Run is_prime on every entry of PRIMES sequentially on the calling thread."""
    for candidate in PRIMES:
        is_prime(candidate)

# Multiple threads
def multi_thread():
    """Run is_prime on every entry of PRIMES using a thread pool.

    The results are not consumed; leaving the ``with`` block waits for all
    submitted work to finish.
    """
    with ThreadPoolExecutor() as executor:
        executor.map(is_prime, PRIMES)

# Multiple processes
def multi_process():
    """Run is_prime on every entry of PRIMES using a process pool.

    The results are not consumed; leaving the ``with`` block waits for all
    submitted work to finish.
    """
    with ProcessPoolExecutor() as executor:
        executor.map(is_prime, PRIMES)

if __name__ == "__main__":
    # Time the same workload with one thread, a thread pool and a process
    # pool, printing the wall-clock duration of each run.
    for label, runner in (
        ("single_thread,cost:", single_thread),
        ("multi_thread,cost:", multi_thread),
        ("multi_process,cost:", multi_process),
    ):
        start = time.time()
        runner()
        end = time.time()
        print(label, end - start, "seconds")

Web服务中进程池加速（cpu密集型）

这里引入了IO操作，但CPU操作占绝大部分

#07.flask_process_pool.py

import flask
import math
import json
from concurrent.futures import ProcessPoolExecutor

# Flask application object; the route in this script is registered on it.
app=flask.Flask(__name__)

def is_prime(n):
    """Return True if the integer ``n`` is prime, using trial division.

    Args:
        n: Integer to test. Values below 2 are never prime.

    Returns:
        True when ``n`` has no divisor other than 1 and itself, else False.
    """
    if n < 2:
        return False
    if n == 2:
        return True
    if n % 2 == 0:
        return False
    # math.isqrt works on integers of any size and is exact, whereas
    # math.sqrt converts to float first and raises OverflowError for very
    # large values.
    sqrt_n = math.isqrt(n)
    # Only odd candidates up to the integer square root need to be tried.
    for i in range(3, sqrt_n + 1, 2):
        if n % i == 0:
            return False
    return True

@app.route("/is_prime/<numbers>")
def api_is_prime(numbers):
    """Check a comma-separated list of integers for primality.

    Args:
        numbers: URL path segment such as ``"3,4,5"``.

    Returns:
        JSON text mapping each number to whether it is prime. The checks are
        dispatched to the module-level ``process_pool``.
    """
    number_list = list(map(int, numbers.split(",")))
    verdicts = process_pool.map(is_prime, number_list)
    payload = dict(zip(number_list, verdicts))
    return json.dumps(payload)

if __name__=="__main__":
    # The process pool must be created inside the main guard; this is one of
    # the differences from a thread pool. api_is_prime reads this module-level
    # name, so it only exists when the file is run as a script.
    # NOTE(review): presumably needed because worker processes may re-import
    # this module on some platforms — confirm against the concurrent.futures docs.
    process_pool = ProcessPoolExecutor()
    app.run()

参考：
https://www.bilibili.com/video/BV1bK411A7tV/?spm_id_from=333.788.videopod.episodes&vd_source=81bfbf91cfd2d9c7f25ac8785b6c8670

posted on 2026-06-02 16:10 漫思阅读(5) 评论(0) 收藏举报

刷新页面返回顶部