
Web Scraping - Async - Timeout Retries & Exception Handling - Efficient Code - Really 10x Faster!!!

Before reading this article you should already know the basics of asynchronous programming for Python crawlers. Further reading: a crawler case study comparing the efficiency of single-threaded, multi-threaded, multi-process, and async approaches.

import asyncio
import aiohttp
from aiohttp import ClientSession, ClientTimeout
import time


# Async request function (with timeout, retry, and exception handling)
async def fetch_url(session: ClientSession, url: str, semaphore: asyncio.Semaphore, retries=3):
    try:
        async with semaphore:  # limit concurrency
            for attempt in range(retries):
                try:
                    async with session.get(url, timeout=ClientTimeout(total=10)) as response:
                        if response.status == 200:
                            html = await response.text()
                            print(f"成功获取 {url}")
                            return html
                        else:
                            print(f"请求失败: {url}, 状态码 {response.status}")
                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                    if attempt < retries - 1:
                        print(f"第 {attempt + 1} 次重试 {url}: {str(e)}")
                        await asyncio.sleep(1)
                    else:
                        print(f"请求失败(最终): {url}, 错误: {str(e)}")
                        return None
    except Exception as e:
        print(f"未知错误: {url}, {str(e)}")
        return None


# Parse the HTML (simulated processing here)
# Parsing and storage also run asynchronously so the I/O wait time is fully used
async def parse_html(html: str):
    # parse with BeautifulSoup or another library
    await asyncio.sleep(0.001)  # simulate lightweight processing
    return {"data": "parsed_result"}


# Main async function
async def main(urls: list, max_concurrency=100):
    start_time = time.time()
    timeout = ClientTimeout(total=10)
    semaphore = asyncio.Semaphore(max_concurrency)  # cap concurrency so a burst of requests does not make the server refuse connections or exhaust local ports

    async with ClientSession(timeout=timeout) as session:  # session-level timeout improves robustness: one slow request cannot stall the whole job
        tasks = [fetch_url(session, url, semaphore) for url in urls]
        results = await asyncio.gather(*tasks)  # submit all tasks at once; asyncio.gather manages a large batch efficiently

        # Process the results asynchronously (parsing, storage, ...) to make the most of I/O wait time
        parsed_tasks = [parse_html(html) for html in results if html]
        parsed_results = await asyncio.gather(*parsed_tasks)

    # Performance statistics
    # Report total elapsed time, number of successful requests, and QPS.
    total_time = time.time() - start_time
    success_count = sum(1 for res in results if res is not None)
    print(f"\n总耗时: {total_time:.2f} 秒")
    print(f"总请求数: {len(urls)}, 成功数: {success_count}")
    print(f"QPS: {len(urls) / total_time:.2f} (次/秒)")


if __name__ == "__main__":
    # Test URL list (example)
    urls = [
               "https://www.example.com",
               "https://www.google.com",
               "https://www.github.com"
           ] * 100  # repeat 100 times to simulate high concurrency

    # Run the event loop
    asyncio.run(main(urls, max_concurrency=200))

 Advanced optimization tips

  1. Connection pool reuse
    aiohttp.ClientSession reuses TCP connections by default, which cuts down handshake overhead (see the connector sketch after this list).

  2. DNS caching (tested on Windows; it did not seem to help and was actually slower)
    Use aiohttp.resolver.AsyncResolver to cache DNS lookup results.
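
A minimal sketch of tuning the connection pool mentioned in point 1; the numeric values below are illustrative assumptions, not recommendations from this post:

import asyncio
import aiohttp


async def run():
    # TCPConnector controls how many TCP connections the session opens and reuses
    connector = aiohttp.TCPConnector(
        limit=100,             # total simultaneous connections (assumed value)
        limit_per_host=20,     # per-host cap, useful when crawling a single domain (assumed value)
        ttl_dns_cache=300,     # aiohttp's built-in DNS cache TTL, in seconds (assumed value)
        keepalive_timeout=30,  # keep idle connections open for reuse (assumed value)
    )
    async with aiohttp.ClientSession(connector=connector) as session:
        async with session.get("https://www.example.com") as resp:
            print(resp.status)


if __name__ == "__main__":
    asyncio.run(run())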

Use cases for DNS caching

    • Frequent requests to the same domain: e.g. a crawler fetching many sub-pages of one site.

    • Reducing DNS lookup latency: useful for API polling or real-time communication.

import asyncio
from datetime import datetime, timedelta

import aiohttp
from aiohttp import ClientSession
from aiohttp.resolver import AsyncResolver

# Work around aiodns event-loop compatibility on Windows: aiodns needs the selector event loop (this policy class exists only on Windows)
if __name__ == "__main__":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

class CachedAsyncResolver(AsyncResolver):
    def __init__(self, ttl=300, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cache = {}  # cache dict: {hostname: (expiry time, [IP addresses])}
        self._ttl = ttl  # cache time-to-live in seconds
        # Start the background cache-cleanup task at construction time.
        # Note: asyncio.create_task needs a running event loop, so this resolver
        # must be created inside an async context (as it is in main() below).
        asyncio.create_task(self.cleanup_cache())
        # Thread-safety improvement: under high concurrency, protect the cache
        # with an asyncio.Lock.
        self._lock = asyncio.Lock()  # async lock

    async def resolve(self, hostname, port=0, family=0):
        async with self._lock:  # lock so the check-then-update sequence stays atomic
            now = datetime.now()
            # Return the cached entry if it exists and has not expired
            if hostname in self._cache:
                expire_time, ips = self._cache[hostname]
                if now < expire_time:
                    print(f"Using cached DNS result: {hostname} -> {ips}")
                    return ips  # return the cached IP list directly

            # Cache miss: perform the actual DNS lookup
            ips = await super().resolve(hostname, port, family)
            # Store the result together with its expiry time
            self._cache[hostname] = (
                now + timedelta(seconds=self._ttl),
                ips
            )
        print(f"新增 DNS 缓存: {hostname} -> {ips} (有效期至 {self._cache[hostname][0]})")
        return ips

    # Cache cleanup: a background task periodically removes expired entries
    # so the cache dict does not grow without bound.
    async def cleanup_cache(self):
        while True:
            await asyncio.sleep(60)  # clean up once per minute
            now = datetime.now()
            expired_hosts = [
                host for host, (expire, _) in self._cache.items()
                if now >= expire
            ]
            for host in expired_hosts:
                del self._cache[host]
            print(f"清理过期缓存: {expired_hosts}")


async def main():
    # Create the custom resolver (cache entries for 5 minutes)
    resolver = CachedAsyncResolver(ttl=300)
    # Use the custom resolver:
    # bind it to the ClientSession via TCPConnector(resolver=resolver).
    # The first request triggers a DNS lookup and caches the result; later requests reuse the cache.
    async with ClientSession(
            connector=aiohttp.TCPConnector(resolver=resolver)
    ) as session:
        # First request (triggers a DNS lookup and caches it)
        async with session.get("https://www.baidu.com") as resp:
            print("First request status code:", resp.status)

        # Second request (served from the cache)
        async with session.get("https://www.baidu.com") as resp:
            print("Second request status code:", resp.status)


if __name__ == "__main__":
    asyncio.run(main())

 

Test code

import asyncio
from datetime import datetime, timedelta
import re
import aiohttp
import requests
from aiohttp import AsyncResolver, ClientSession
from bs4 import BeautifulSoup
# import winloop
# winloop.install()
# Work around the aiodns event-loop compatibility issue on Windows
if __name__ == "__main__":
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
class CachedAsyncResolver(AsyncResolver):
    def __init__(self, ttl=300, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._cache = {}  # cache dict: {hostname: (expiry time, [IP addresses])}
        self._ttl = ttl  # cache time-to-live in seconds
        # Start the background cache-cleanup task at construction time.
        # Note: asyncio.create_task needs a running event loop, so this resolver
        # must be created inside an async context.
        asyncio.create_task(self.cleanup_cache())
        # Thread-safety improvement: under high concurrency, protect the cache
        # with an asyncio.Lock.
        self._lock = asyncio.Lock()  # async lock

    async def resolve(self, hostname, port=0, family=0):
        async with self._lock:  # lock so the check-then-update sequence stays atomic
            now = datetime.now()
            # Return the cached entry if it exists and has not expired
            if hostname in self._cache:
                expire_time, ips = self._cache[hostname]
                if now < expire_time:
                    print(f"Using cached DNS result: {hostname} -> {ips}")
                    return ips  # return the cached IP list directly

            # Cache miss: perform the actual DNS lookup
            ips = await super().resolve(hostname, port, family)
            # Store the result together with its expiry time
            self._cache[hostname] = (
                now + timedelta(seconds=self._ttl),
                ips
            )
        print(f"新增 DNS 缓存: {hostname} -> {ips} (有效期至 {self._cache[hostname][0]})")
        return ips

    # Cache cleanup: a background task periodically removes expired entries
    # so the cache dict does not grow without bound.
    async def cleanup_cache(self):
        while True:
            await asyncio.sleep(60)  # clean up once per minute
            now = datetime.now()
            expired_hosts = [
                host for host, (expire, _) in self._cache.items()
                if now >= expire
            ]
            for host in expired_hosts:
                del self._cache[host]
            print(f"清理过期缓存: {expired_hosts}")

# Single time-consuming task: fetch and parse one exercise page
async def process_href(href, rootUrl, session):
    topicUrl = rootUrl + href

    # Create the custom resolver (cache entries for 5 minutes)
    resolver = CachedAsyncResolver(ttl=300)
    # Use the custom resolver:
    # bind it to the ClientSession via TCPConnector(resolver=resolver).
    # The first request triggers a DNS lookup and caches the result; later requests reuse the cache.
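    # Note: a fresh resolver and session are created for every href, so each
    # cache starts empty and is never shared across requests; this is likely
    # why DNS caching showed no speedup in the Windows test mentioned above.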
    async with ClientSession(
            connector=aiohttp.TCPConnector(resolver=resolver)
    ) as session:
        # First request (triggers a DNS lookup and caches it)
        async with session.get(topicUrl, headers=header) as res:
            soup = BeautifulSoup(await res.text(), "lxml")
            return await analysishtml(soup, href)


async def analysishtml(soup, href):
    # 1. Collect all description paragraphs for the exercise
    allP = soup.find("div", id="content").find_all("p")
    # dict holding the sections extracted from this page
    dic = {}
    # list collecting all description text
    noteList = []
    for p in allP:
        if not p.find("a"):  # skip the "back to previous page" link
            noteList.append(p.get_text())
    dic["noteList"] = noteList
    # 2. Grab the example code
    try:
        code = soup.find("div", id="content").find("div", class_="example")
        if code:
            dic["code"] = code.get_text()
        else:
            pre = soup.find("div", id="content").find("pre")
            dic["code"] = pre.get_text()
    except Exception as e:
        pre = soup.find("div", id="content").find("pre")
        print(pre)
        print(f"第{hreflist.index(href) + 1}题获取示例代码出错了:{e}")
    # 三、获取执行结果
    try:
        result = soup.find("div", id="content").find("pre")
        if result:
            dic["result"] = result.get_text()
        else:
            imgSrc = soup.find("div", id="content").find("img", attrs={"src": re.compile(r"^//")}).attrs["src"]
            dic["result"] = f"执行结果为非文本数据:数据链接为:https:{imgSrc}"
    except Exception as e:
        print(f"第{hreflist.index(href) + 1}题获取执行结果出错了:{e}")
        dic["result"] = "无执行结果!"
    return dic


# Write the results to a file
async def writeInFile(pageList):
    with open("../Test/python100例题.txt", "w+", encoding="utf-8") as f:
        # with open("../Test/python100例题.doc","w+",encoding="utf-8") as f:
        # enumerate gives the 1-based exercise number; reading the dict inside
        # the try lets a page with missing sections be reported and skipped
        # instead of aborting the whole write
        for i, dic in enumerate(pageList, start=1):
            try:
                noteList = dic["noteList"]
                code = dic["code"]
                result = dic["result"]
                if len(noteList) == 4:
                    f.write(f"{i}" + noteList[0] + "\n")
                    f.write(noteList[1] + "\n")
                    f.write(noteList[2] + "\n")
                    f.write(code + "\n")
                    f.write(noteList[3] + "\n")
                    f.write(result + "\n")
                    f.write("*" * 50 + "\n")
                else:
                    f.write(f"{i}" + noteList[0] + "\n")
                    f.write(noteList[1] + "\n")
                    f.write(code + "\n")
                    f.write(noteList[2] + "\n")
                    f.write(result + "\n")
                    f.write("*" * 50 + "\n")
            except Exception as e:
                print(f"Exercise {i} failed to write: {e}")
    print("Writing finished!!!")


if __name__ == '__main__':
    url = "https://www.runoob.com/python/python-100-examples.html"
    rootUrl = "https://www.runoob.com"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    response.encoding = response.apparent_encoding
    # Convert the page content into a BeautifulSoup object
    soup = BeautifulSoup(response.text, "lxml")
    # Find the exercise links
    uls = soup.find("div", id="content").find_all("ul")
    # List of links collected from the table of contents
    hreflist = []
    hreflist = []
    for ul in uls:
        lis = ul.find_all("li")
        for li in lis:
            hreflist.append(li.find("a").get("href"))


    async def main():
        time1 = datetime.now()
        print("*" * 20, time1.strftime('%Y-%m-%d %H:%M:%S'), "*" * 20)
        async with aiohttp.ClientSession(headers=header) as session:
            worklist = [process_href(href, rootUrl, session) for href in hreflist]
            pageList = await asyncio.gather(*worklist)
        await writeInFile(pageList)
        time2 = datetime.now()
        print("End time", "*" * 20, time2.strftime('%Y-%m-%d %H:%M:%S'), "*" * 20)
        print(time2 - time1)


    asyncio.run(main())

 

 

  1. Adjust concurrency dynamically
    Raise or lower the semaphore value based on server response times (see the sketch after this list).

  2. Combine with uvloop
    Replace the default event loop with uvloop for roughly a 20-30% performance boost:

    import uvloop
    uvloop.install()
  • uvloop is a high-performance event loop built on libuv; its underlying C extension has compatibility problems on Windows.

The project explicitly states that Windows is not supported (see the GitHub repository); it targets Linux and macOS.
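
A minimal sketch of point 1, under stated assumptions: the AdaptiveLimiter class, its thresholds, and the window size below are illustrative, not part of the original code. A plain asyncio.Semaphore is kept, and a small controller grows or shrinks the number of available permits based on recent response times.

import asyncio
from collections import deque


class AdaptiveLimiter:
    def __init__(self, start=50, low=10, high=200):
        self.sem = asyncio.Semaphore(start)
        self.current = start
        self.low, self.high = low, high    # hard bounds on concurrency (assumed values)
        self.latencies = deque(maxlen=50)  # rolling window of recent response times

    def record_latency(self, seconds: float):
        self.latencies.append(seconds)

    async def adjust(self, slow=2.0, fast=0.5):
        # call periodically; the thresholds (in seconds) are illustrative
        if not self.latencies:
            return
        avg = sum(self.latencies) / len(self.latencies)
        if avg > slow and self.current > self.low:
            await self.sem.acquire()  # take a permit away (may wait for an in-flight request to finish)
            self.current -= 1
        elif avg < fast and self.current < self.high:
            self.sem.release()        # hand out an extra permit -> more concurrent requests
            self.current += 1

Workers would wrap each request in "async with limiter.sem:", call limiter.record_latency() with the measured response time, and a background task would call limiter.adjust() every few seconds.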

 Caching downloaded pages (verified on Windows; run time stable at 0:00:02.103662)

 Template code

# 2. Cache downloaded pages
# If the crawler runs more than once, cache the downloaded HTML to avoid repeated requests:
import os
import hashlib

import requests


def get_page(url):
    cache_dir = "cache"
    os.makedirs(cache_dir, exist_ok=True)
    url_hash = hashlib.md5(url.encode()).hexdigest()
    cache_path = os.path.join(cache_dir, url_hash)

    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return f.read()
    else:
        response = requests.get(url)
        html = response.text
        with open(cache_path, "w", encoding="utf-8") as f:  # "w": write the fetched page into the cache
            f.write(html)
        return html
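
A quick usage sketch (the URL is only a placeholder): the first call downloads the page and writes it under cache/, the second call is served from that local file.

if __name__ == "__main__":
    html = get_page("https://www.example.com")        # fetches and caches
    html_again = get_page("https://www.example.com")  # read back from cache/
    print(len(html), len(html_again))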

Test sample

import asyncio
import datetime
import hashlib
import os
import re
import aiohttp
import requests
from bs4 import BeautifulSoup

# import winloop
# winloop.install()

# Single time-consuming task: fetch (or load from cache) and parse one exercise page
async def process_href(href, rootUrl, session):
    topicUrl = rootUrl + href

    # Make sure the cache directory exists
    cache_dir = "../cache"
    os.makedirs(cache_dir, exist_ok=True)
    url_hash = hashlib.md5(topicUrl.encode()).hexdigest()
    cache_path = os.path.join(cache_dir, url_hash)

    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            soup = BeautifulSoup(f, "lxml")
            return await analysishtml(soup, href)
    else:
        async with session.get(topicUrl, headers=header) as res:
            html = await res.text()  # read the body once, reuse it for parsing and for the cache file
            soup = BeautifulSoup(html, "lxml")
            with open(cache_path, "w", encoding="utf-8") as f:
                f.write(html)

            return await analysishtml(soup, href)

async def analysishtml(soup, href):
    # 1. Collect all description paragraphs for the exercise
    allP = soup.find("div", id="content").find_all("p")
    # dict holding the sections extracted from this page
    dic = {}
    # list collecting all description text
    noteList = []
    for p in allP:
        if not p.find("a"):  # skip the "back to previous page" link
            noteList.append(p.get_text())
    dic["noteList"] = noteList
    # 2. Grab the example code
    try:
        code = soup.find("div", id="content").find("div", class_="example")
        if code:
            dic["code"] = code.get_text()
        else:
            pre = soup.find("div", id="content").find("pre")
            dic["code"] = pre.get_text()
    except Exception as e:
        pre = soup.find("div", id="content").find("pre")
        print(pre)
        print(f"第{hreflist.index(href) + 1}题获取示例代码出错了:{e}")
    # 三、获取执行结果
    try:
        result = soup.find("div", id="content").find("pre")
        if result:
            dic["result"] = result.get_text()
        else:
            imgSrc = soup.find("div", id="content").find("img", attrs={"src": re.compile(r"^//")}).attrs["src"]
            dic["result"] = f"执行结果为非文本数据:数据链接为:https:{imgSrc}"
    except Exception as e:
        print(f"第{hreflist.index(href) + 1}题获取执行结果出错了:{e}")
        dic["result"] = "无执行结果!"
    return dic


# Write the results to a file
async def writeInFile(pageList):
    with open("../Test/python100例题.txt", "w+", encoding="utf-8") as f:
        # with open("../Test/python100例题.doc","w+",encoding="utf-8") as f:
        # enumerate gives the 1-based exercise number; reading the dict inside
        # the try lets a page with missing sections be reported and skipped
        # instead of aborting the whole write
        for i, dic in enumerate(pageList, start=1):
            try:
                noteList = dic["noteList"]
                code = dic["code"]
                result = dic["result"]
                if len(noteList) == 4:
                    f.write(f"{i}" + noteList[0] + "\n")
                    f.write(noteList[1] + "\n")
                    f.write(noteList[2] + "\n")
                    f.write(code + "\n")
                    f.write(noteList[3] + "\n")
                    f.write(result + "\n")
                    f.write("*" * 50 + "\n")
                else:
                    f.write(f"{i}" + noteList[0] + "\n")
                    f.write(noteList[1] + "\n")
                    f.write(code + "\n")
                    f.write(noteList[2] + "\n")
                    f.write(result + "\n")
                    f.write("*" * 50 + "\n")
            except Exception as e:
                print(f"Exercise {i} failed to write: {e}")
    print("Writing finished!!!")


if __name__ == '__main__':
    url = "https://www.runoob.com/python/python-100-examples.html"
    rootUrl = "https://www.runoob.com"
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
    }
    response = requests.get(url, headers=header)
    response.encoding = response.apparent_encoding
    # Convert the page content into a BeautifulSoup object
    soup = BeautifulSoup(response.text, "lxml")
    # Find the exercise links
    uls = soup.find("div", id="content").find_all("ul")
    # List of links collected from the table of contents
    hreflist = []
    hreflist = []
    for ul in uls:
        lis = ul.find_all("li")
        for li in lis:
            hreflist.append(li.find("a").get("href"))


    async def main():
        time1 = datetime.datetime.now()
        print("*" * 20, time1.strftime('%Y-%m-%d %H:%M:%S'), "*" * 20)
        async with aiohttp.ClientSession(headers=header) as session:
            worklist = [process_href(href, rootUrl, session) for href in hreflist]
            pageList = await asyncio.gather(*worklist)
        await writeInFile(pageList)
        time2 = datetime.datetime.now()
        print("End time", "*" * 20, time2.strftime('%Y-%m-%d %H:%M:%S'), "*" * 20)
        print(time2 - time1)


    asyncio.run(main())

 

posted @ 2025-03-28 22:44  指尖下的世界