爬取Microsoft Bing网站图片

说明
这个小案例主要是访问Microsoft Bing网站,爬取“车牌”图片。代码写得不够规范,但是效果还行;更快速的异步爬虫请看这个链接:使用python协程爬取图片

  • 代码文件结构为下图:

  • 具体思路

#main.py
#运行函数的入口
from requests import get
# import time
# from tqdm import tqdm
import os
from download_img import download
def download_html(search_len,search_name,url,headers,params):
    """Fetch result pages one after another and save each as an HTML file.

    Args:
        search_len: Number of pages to fetch; converted with ``int()``.
        search_name: Search keyword, used to build the output path
            ``./{search_name}_html/{search_name}_{page}.html``.
        url: Endpoint requested for every page.
        headers: HTTP headers sent with every request.
        params: Query parameters. This dict is modified in place after
            every page: ``first`` grows by the current ``count``, ``SFX``
            grows by one and ``count`` is set to 35.

    The output directory must already exist.
    """
    page_total = int(search_len)
    for page_index in range(page_total):
        page = get(url, headers=headers, params=params)
        page_text = page.text
        target_path = f'./{search_name}_html/{search_name}_{page_index}.html'
        with open(target_path, 'w', encoding='utf-8') as html_file:
            html_file.write(page_text)
        # Advance the shared paging state so the next request asks for the
        # following batch of results.
        params['first'] = params['first'] + params['count']
        params['SFX'] = params['SFX'] + 1
        params['count'] = 35
def main(search_name=None,search_len=None):
    """Save Bing image-search result pages, then download the images in them.

    Args:
        search_name: Search keyword; also used to name the
            ``./{search_name}_html`` working directory.
        search_len: Number of result pages, forwarded to ``download_html``.

    The result pages are written by ``download_html``; afterwards every
    entry found in the HTML directory is handed to ``download``.
    """
    html_dir = f'./{search_name}_html'
    endpoint = 'https://cn.bing.com/images/async'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36 Edg/134.0.0.0',
    }
    # Key order is kept as-is because it determines the query-string order.
    query = {
        'q': search_name,
        'first': 13,
        'count': 12,  # number of images requested
        'cw': 437,
        'ch': 603,
        'relp': 12,
        'datsrc': 'I',
        'layout': 'ColumnBased_Landscape',
        'apc': 0,
        'imgbf': 'DfCtqwgAAACQAQAAAAAAAAAAAAAFAAAAsUuBIONfTNlAgBAASAEAQgAQIFEABiAGIBgAgNDAoQAIUAAIgUYIAAiAQCAAABIiIBgIIAAAACkCAAAAAAAAAA==',
        'mmasync': 2,
        'dgState': 'c*2_y*725s715_i*13_w*204',
        'IG': '32F3E4B3953D4FFCB5B4E5EDB527C8EC',
        'SFX': 2,  # page number
        'iid': 'images.5306',
    }

    # Create the directory for the saved result pages on first use.
    already_there = os.path.exists(html_dir)
    if not already_there:
        os.mkdir(html_dir)

    download_html(search_len, search_name, endpoint, request_headers, query)

    # Every entry in the HTML directory is processed, not only the pages
    # written by this run.
    for page_file in os.listdir(html_dir):
        download(page_file, search_name)
    
if __name__ == '__main__':
    # Ask for the search keyword first, then for the number of result pages,
    # and pass both (still as strings) to main().
    search_name, search_len = (
        input('请输入搜索内容:'),
        input('请输入要爬取图片页数:'),
    )
    main(search_name, search_len)

#download_img.py
from lxml import etree
from requests import get
# import os
from os.path import exists
from os import mkdir
import re
from re import compile
# from tqdm import tqdm
# from 爬取html import dir_name
def download(html_name,dir_name):
    """Download every image referenced by one saved result page.

    Reads ``./{dir_name}_html/{html_name}``, collects the ``data-src``
    attribute of every ``<img>`` element and saves each image as
    ``./{dir_name}_image_dir/<name>.jpg``, where ``<name>`` is the part of
    the URL between ``OIP-C.`` and the query string.

    URLs that do not match that pattern are skipped without being
    requested, because no file name can be derived for them.

    Args:
        html_name: File name of the saved page inside the HTML directory.
        dir_name: Search keyword used as the prefix of both directories.
    """
    headers={
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36 Edg/134.0.0.0',
    }
    with open(f'./{dir_name}_html/{html_name}',encoding='utf-8') as f:
        text=f.read()

    tree=etree.HTML(text)
    img_src_list=tree.xpath('//img/@data-src')
    # The named group "name" captures the image id used as the file name.
    p=re.compile(r'https.*/OIP-C\.(?P<name>.*)\?.*')

    image_dir=f'./{dir_name}_image_dir'
    if not exists(image_dir):
        mkdir(image_dir)

    for url in img_src_list:
        matched=p.search(url)
        if matched is None:
            # Not an OIP-C thumbnail URL: calling .group() on the failed
            # match would raise AttributeError, so skip it up front.
            continue
        content=get(url,headers=headers).content
        with open(f'{image_dir}/'+matched.group('name')+'.jpg','wb') as img_file:
            img_file.write(content)

使用多线程改写上述代码

import threading
import time
import os
import queue
import requests
from lxml import etree
import re
def download_img():
    """Download one queued image and save it under ``./{search_name}_img/``.

    Takes one URL from ``q_url`` and the matching file name from
    ``q_name``, fetches the URL and writes the response body to disk.

    Uses the module-level ``q_url``, ``q_name``, ``condition``, ``headers``
    and ``search_name``.
    """
    # Take the URL and its file name as a single step. The two queues are
    # filled in the same order, but with separate unsynchronised get() calls
    # another worker could slip in between them and an image would be saved
    # under a different image's name.
    with condition:
        img_url=q_url.get()
        file_name=q_name.get()

    response=requests.get(img_url,headers=headers)

    # exist_ok=True: several workers may reach this line at the same time,
    # and a separate exists()/mkdir() pair could raise FileExistsError.
    os.makedirs(f"./{search_name}_img",exist_ok=True)
    with open(f"./{search_name}_img/"+file_name,'wb') as fp:
        fp.write(response.content)

    # Mark both items as processed so a join() on either queue can return.
    q_url.task_done()
    q_name.task_done()
    
def download_url(i):
    """Fetch one page of image results and queue its image URLs and names.

    The page to fetch is taken from the shared module-level ``params``,
    which is advanced for the next worker; ``i`` is the worker index passed
    by the caller and does not select the page.

    For every ``<img data-src=...>`` URL matching the ``OIP-C`` pattern,
    the URL is put on ``q_url`` and ``<name>.jpg`` on ``q_name``. URLs that
    do not match are skipped so the two queues always stay aligned.

    Uses the module-level ``condition``, ``params``, ``url``, ``headers``,
    ``q_url`` and ``q_name``.
    """
    # The named group "name" captures the image id used as the file name.
    p=re.compile(r'https.*/OIP-C\.(?P<name>.*)\?.*')

    # Claim the next page: copy the current paging parameters and advance
    # the shared ones while holding the lock. The slow HTTP request then runs
    # outside the lock, so the page workers really run concurrently.
    with condition:
        page_params=dict(params)
        params['first']+=params['count']
        params['SFX']+=1
        params['count']=35

    response=requests.get(url,headers=headers,params=page_params)
    tree=etree.HTML(response.text)
    # Collect the image links of this page.
    img_src_list=tree.xpath('//img/@data-src')

    # Resolve every file name before touching the queues, so a URL without
    # a usable name never ends up in q_url alone.
    pairs=[]
    for src in img_src_list:
        matched=p.search(src)
        if matched is not None:
            pairs.append((src,matched.group('name')+'.jpg'))

    # Enqueue under the lock so that two workers cannot interleave their
    # puts and break the position-wise pairing of q_url and q_name.
    with condition:
        for src,file_name in pairs:
            q_url.put(src)
            q_name.put(file_name)

    # current_thread() replaces the deprecated currentThread() alias.
    print(f'{threading.current_thread().name}已经完成,现在q_url的长度为{q_url.qsize()}')

def main():
    """Run the two crawl phases with one thread per unit of work.

    Phase one starts ``int(search_len)`` threads running ``download_url``
    and waits for all of them. Phase two starts one ``download_img`` thread
    per item then present in ``q_url`` and waits for those as well.
    """
    page_total = int(search_len)
    page_workers = []
    for page_index in range(page_total):
        page_workers.append(
            threading.Thread(target=download_url, args=(page_index,))
        )
    for worker in page_workers:
        worker.start()
    for worker in page_workers:
        worker.join()

    # The queue size is read only after every page worker has finished, so
    # it equals the number of images waiting to be downloaded.
    image_total = q_url.qsize()
    image_workers = []
    for _ in range(image_total):
        image_workers.append(threading.Thread(target=download_img))
    for worker in image_workers:
        worker.start()
    for worker in image_workers:
        worker.join()

if __name__ == '__main__':
    # Queue holding the image download URLs.
    q_url=queue.Queue()
    # Queue holding the file name for each queued URL, in the same order.
    q_name=queue.Queue()
    # Re-entrant lock the worker threads use to guard the shared paging
    # parameters.
    condition=threading.RLock()
    url_list=[]
    headers={
        'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36 Edg/134.0.0.0',
    }
    url='https://cn.bing.com/images/async'
    search_name=input('请输入搜索内容:')
    search_len=input('请输入要爬取图片页数:')

    params={
        'q':search_name,
        'first': 13,
        'count': 12,  # number of images requested
        'cw': 437,
        'ch': 603,
        'relp': 12,
        'datsrc': 'I',
        'layout': 'ColumnBased_Landscape',
        'apc': 0,
        'imgbf': 'DfCtqwgAAACQAQAAAAAAAAAAAAAFAAAAsUuBIONfTNlAgBAASAEAQgAQIFEABiAGIBgAgNDAoQAIUAAIgUYIAAiAQCAAABIiIBgIIAAAACkCAAAAAAAAAA==',
        'mmasync': 2,
        'dgState': 'c*2_y*725s715_i*13_w*204',
        'IG': '32F3E4B3953D4FFCB5B4E5EDB527C8EC',
        'SFX': 2,  # page number
        'iid': 'images.5306',
    }

    # Start the clock only after the prompts have been answered; otherwise
    # the reported duration would include the time the user spent typing.
    # perf_counter() is the clock intended for measuring elapsed time.
    s=time.perf_counter()
    main()
    e=time.perf_counter()
    print(f'耗费时间{e-s:.2f}s')
posted @ 2025-03-12 13:19  CodeCraftsMan  阅读(513)  评论(0)    收藏  举报