# Multi-threaded crawling: download images on a background thread to speed up processing.
import re
import os
import threading
import time

import numpy as np


class baidu_spider():
    """Threaded Baidu image-search crawler.

    A producer thread runs `make_spider_list(keyword)`, which pages through
    Baidu image-search results, downloads the images, decodes them with
    OpenCV and appends them to an in-memory buffer.  A consumer calls
    `start_return_spider_image()` to pull decoded images one at a time,
    blocking until one is available.  `stop()` shuts the producer down and
    unblocks any waiting consumer; `parameter_init()` resets all state.

    NOTE: `requests` and `cv2` are imported lazily inside the crawl method so
    the module can be imported (and the queue/consumer logic used or tested)
    without those third-party packages installed.
    """

    def __init__(self):
        # Browser-like UA; Baidu serves different (anti-bot) markup otherwise.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
        }
        # URL template; `page` is an absolute result offset (15 results/page).
        self.url_template = 'https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word={keyword}&pn={page}'
        self.current_page = 0
        self.image_list = []          # decoded images (producer appends, consumer reads)
        self.image_list_index = 0     # next index the consumer will deliver
        self.lock = threading.Lock()  # guards image_list / image_list_index
        self.event = threading.Event()  # signals "new image available" / "stopped"
        self.stop_flag = False
        self.current_keyword = None
        self.max_retries = 3   # max download attempts per image URL
        self.timeout = 10      # per-request timeout (seconds)

    def make_spider_list(self, keyword):
        """Producer loop: crawl result pages for *keyword* until `stop()`.

        Keeps roughly 10 undelivered images buffered ahead of the consumer;
        sleeps when the buffer is full.  Network/decoding failures are logged
        and retried up to `self.max_retries` per image, then skipped.
        """
        import requests  # lazy: only the crawler thread needs network deps
        import cv2       # lazy: heavy optional dependency

        self.current_keyword = keyword
        self.current_page = 0
        while not self.stop_flag:
            # How many images are buffered but not yet delivered?
            with self.lock:
                need_images = 10 - (len(self.image_list) - self.image_list_index)
            if need_images <= 0:
                time.sleep(0.5)  # buffer full — let the consumer catch up
                continue

            url = self.url_template.format(keyword=keyword, page=self.current_page * 15)
            self.current_page += 1
            try:
                res = requests.get(url, headers=self.headers, timeout=self.timeout)
                html = res.content.decode('utf-8', errors='ignore')
                # "objURL" holds the original (full-size) image URL.
                image_urls = re.findall(r'"objURL":"(.*?)",', html)
                if not image_urls:
                    print(f"在第{self.current_page}页没有找到图片URL,可能被反爬")
                    time.sleep(2)  # back off before trying the next page
                    continue

                for img_url in image_urls:
                    if self.stop_flag:
                        break
                    retries = 0
                    while retries < self.max_retries:
                        try:
                            img_response = requests.get(img_url, headers=self.headers, timeout=self.timeout)
                            img_response.raise_for_status()
                            # Zero-copy view of the response bytes; imdecode
                            # returns a fresh array, so no extra .copy() needed.
                            img_array = np.frombuffer(img_response.content, dtype=np.uint8)
                            img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
                            if img is None:
                                raise ValueError("无法解码图像")
                            with self.lock:
                                self.image_list.append(img)
                                self.event.set()  # wake a waiting consumer
                            print(f"成功下载图片 {len(self.image_list)}")
                            break  # this URL done — move to the next one
                        except Exception as e:
                            retries += 1
                            print(f"下载图片失败 ({retries}/{self.max_retries}): {str(e)}")
                            time.sleep(1)  # brief pause before retrying
            except Exception as e:
                print(f"获取图片列表失败: {str(e)}")
                time.sleep(2)  # longer back-off on page-level failure
        print("爬虫线程已停止")

    def start_return_spider_image(self):
        """Return the next image, blocking until one is available.

        Returns None once the crawler has been stopped and the buffer is
        drained.  Uses a timeout-and-recheck wait: the original cleared the
        event after releasing the lock, so a producer `set()` landing in that
        window was lost and the consumer could stall (lost-wakeup race).
        """
        while True:
            with self.lock:
                if self.image_list_index < len(self.image_list):
                    image = self.image_list[self.image_list_index]
                    self.image_list_index += 1
                    # Reclaim delivered entries periodically; keeping every
                    # image forever grows memory without bound.  Done under
                    # the lock so the producer's len-index math stays valid.
                    if self.image_list_index >= 50:
                        del self.image_list[:self.image_list_index]
                        self.image_list_index = 0
                    return image
                if self.stop_flag:
                    return None  # crawler stopped and nothing left to deliver
                self.event.clear()  # cleared while still holding the lock
            # Bounded wait, then re-check the buffer and the stop flag.
            self.event.wait(timeout=0.5)

    def parameter_init(self):
        """Reset all crawler state so a new keyword can be crawled."""
        with self.lock:
            self.current_page = 0
            self.image_list = []
            self.image_list_index = 0
            self.stop_flag = False
            self.current_keyword = None
            self.event.clear()

    def stop(self):
        """Stop the crawler and unblock any consumer waiting on the event."""
        self.stop_flag = True
        self.event.set()  # ensure waiting threads wake up and observe the flag
