po3a  

使用多线程同时处理(一个线程爬取、一个线程消费图片),提升整体处理速度

import requests
import re
import os
import numpy as np
import cv2
import threading
import time


class baidu_spider():
    """Threaded Baidu image spider.

    A producer thread runs ``make_spider_list`` to keep a small queue of
    decoded OpenCV images topped up; consumers call
    ``start_return_spider_image`` to pull images one at a time, blocking
    until an image is available or ``stop`` has been called.

    All shared state (``image_list``, ``image_list_index``, ``stop_flag``
    transitions and the ``Event``) is touched under ``self.lock`` so that
    producer/stop notifications can never be lost by a consumer.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
        }
        # URL template; keyword and page offset are filled in per request.
        self.url_template = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={keyword}&pn={page}'
        self.current_page = 0
        self.image_list = []            # decoded images appended by the producer
        self.image_list_index = 0       # next index a consumer will read
        self.lock = threading.Lock()    # guards queue state and event set/clear
        self.event = threading.Event()  # "new image available" / "stopped" signal
        self.stop_flag = False
        self.current_keyword = None
        self.max_retries = 3  # max download attempts per image URL
        self.timeout = 10     # per-request timeout in seconds

    def make_spider_list(self, keyword):
        """Producer loop: crawl Baidu image search for *keyword*.

        Keeps roughly 10 undelivered images queued, refilling from
        successive result pages, until :meth:`stop` sets ``stop_flag``.
        """
        self.current_keyword = keyword
        self.current_page = 0

        while not self.stop_flag:
            with self.lock:
                # Number of images still needed to keep ~10 undelivered
                # ones queued ahead of the consumer.
                need_images = 10 - (len(self.image_list) - self.image_list_index)

            if need_images <= 0:
                # Queue is full enough; back off briefly before re-checking.
                time.sleep(0.5)
                continue

            # Baidu's flip endpoint pages in steps of 15 results.
            url = self.url_template.format(keyword=keyword, page=self.current_page * 15)
            self.current_page += 1

            try:
                image_urls = self._fetch_image_urls(url)
            except Exception as e:
                print(f"获取图片列表失败: {str(e)}")
                time.sleep(2)  # back off longer on a page-level failure
                continue

            if not image_urls:
                print(f"在第{self.current_page}页没有找到图片URL,可能被反爬")
                # NOTE: current_page was already advanced, so after the
                # back-off we move on to the NEXT page (the original comment
                # claimed "retry" but the code never re-fetched this page).
                time.sleep(2)
                continue

            for img_url in image_urls:
                if self.stop_flag:
                    break
                self._download_image(img_url)

        print("爬虫线程已停止")

    def _fetch_image_urls(self, url):
        """Fetch one search-result page and return its objURL list.

        Raises whatever ``requests.get`` raises (handled by the caller).
        """
        res = requests.get(url, headers=self.headers, timeout=self.timeout)
        html = res.content.decode('utf-8', errors='ignore')
        return re.findall('"objURL":"(.*?)",', html)

    def _download_image(self, img_url):
        """Download and decode one image, retrying up to ``max_retries``.

        On success the image is appended to the shared queue and the event
        is set — both under ``self.lock`` so a consumer that has just
        cleared the event (also under the lock) cannot miss the wake-up.
        Returns True on success, False if every attempt failed.
        """
        retries = 0
        while retries < self.max_retries:
            try:
                img_response = requests.get(img_url, headers=self.headers, timeout=self.timeout)
                img_response.raise_for_status()  # surface HTTP errors as exceptions

                img_array = np.array(bytearray(img_response.content), dtype=np.uint8)
                img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                if img is None:
                    raise ValueError("无法解码图像")

                with self.lock:
                    self.image_list.append(img.copy())
                    # Read the count under the lock — the original read it
                    # outside, racing with the consumer/other appends.
                    count = len(self.image_list)
                    self.event.set()

                print(f"成功下载图片 {count}")
                return True
            except Exception as e:
                retries += 1
                print(f"下载图片失败 ({retries}/{self.max_retries}): {str(e)}")
                time.sleep(1)  # brief pause before retrying this URL
        return False

    def start_return_spider_image(self):
        """Return the next image, blocking until one is available.

        Returns None once :meth:`stop` has been called and the queue has
        been fully drained.
        """
        while True:
            with self.lock:
                if self.image_list_index < len(self.image_list):
                    image = self.image_list[self.image_list_index]
                    self.image_list_index += 1
                    return image
                if self.stop_flag:
                    return None  # spider stopped and no more images
                # Clear *inside* the lock.  The producer and stop() both
                # set the event under this same lock, so a set() issued
                # after this clear() can never be lost.  (The original
                # cleared outside the lock, which could dead-lock a
                # consumer racing with stop().)
                self.event.clear()
            self.event.wait()

    def parameter_init(self):
        """Reset all crawl state so a new keyword can be crawled."""
        with self.lock:
            self.current_page = 0
            self.image_list = []
            self.image_list_index = 0
            self.stop_flag = False
            self.current_keyword = None
            self.event.clear()

    def stop(self):
        """Stop the producer loop and wake any blocked consumers."""
        with self.lock:
            self.stop_flag = True
            # Set under the lock so a consumer that just cleared the
            # event cannot miss this final wake-up.
            self.event.set()

 

posted on 2025-04-29 09:25  po3a  阅读(8)  评论(0)    收藏  举报