批量爬取10000张百度图片

1.分析网页
按F12打开浏览器的开发者工具，切换到 Network（网络）面板，向下滚动图片搜索结果页，即可看到以 acjson 开头的分页请求，其中 pn 参数每次递增 30。
import urllib.request
import urllib.parse
import re
import os
x = 1  # first number used in saved file names; Imgurl advances it once per saved image
def Imgpath(word):
    """
    Create a fresh download folder for a search keyword and return its path.

    The folder name is the current working directory with its last four
    characters removed, followed by ``word``. If that folder already exists,
    a numeric suffix (``1``, ``2``, ...) is appended until an unused name is
    found, so repeated runs for the same keyword never fail on an existing
    folder.

    :param word: search keyword, used as the folder name
    :return: path of the newly created download folder
    """
    # NOTE(review): dropping the last four characters of the cwd is kept from
    # the original; presumably the script is run from a four-character
    # subfolder of the intended parent directory -- confirm.
    base_path = os.getcwd()[:-4] + word

    file_path = base_path
    suffix = 0
    while os.path.exists(file_path):
        # Name is taken: try <base>1, <base>2, ... instead of crashing.
        suffix += 1
        file_path = base_path + str(suffix)

    os.makedirs(file_path)
    return file_path



# Walk the paginated search results and download the images found on each page.
def Imgurl(word):
    """
    Search Baidu Images for a keyword and save every thumbnail found.

    Result pages are requested for ``pn`` = 30, 60, ... (below 10000). The
    ``thumbURL`` values of each response are downloaded into the folder
    returned by ``Imgpath(word)`` as ``<word><counter>.png``, where the
    counter is the module-level ``x`` and is advanced once per saved file.

    A page or image that cannot be fetched or decoded is skipped, so one
    failure does not abort the whole run.

    :param word: search keyword
    :return: None; the images are written to disk
    """
    global x  # running file-name counter shared at module level
    img_path = Imgpath(word)  # create the target folder
    timeout = 15  # seconds; without it a stalled connection blocks forever

    # Mimic a browser: the endpoint expects a user agent and a referer.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
        "referer": "https://image.baidu.com"
    }
    # Percent-encode the keyword so it can be placed in the query string.
    content = urllib.parse.quote(word, encoding='utf-8')

    # Capture the complete quoted value. Requiring a trailing ".jpg" both
    # dropped the extension and, for thumbnails that do not end in ".jpg",
    # let the match run on into later fields of the response.
    thumb_pattern = re.compile(r'"thumbURL":"(.*?)"')

    # Query template taken from the requests the browser sends while
    # scrolling: only pn (result offset, step 30) and gsm (pn in hex) vary.
    # NOTE(review): "cg=star" is hard-coded from the sample request --
    # confirm it is wanted for every keyword.
    url_template = (
        'https://image.baidu.com/search/acjson?tn=resultjson_com'
        '&logid=10921846348672786587&ipn=rj&ct=201326592&is=&fp=result'
        '&queryWord={content}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1'
        '&z=&ic=&hd=&latest=&copyright=&word={content}&s=&se=&tab=&width='
        '&height=&face=0&istype=2&qc=&nc=1&fr=&expermode=&force=&cg=star'
        '&pn={pn}&rn=30&gsm={gsm}'
    )

    for num in range(30, 10000, 30):
        gsm = hex(num)[2:]  # hexadecimal form of pn without the "0x" prefix
        page_url = url_template.format(content=content, pn=num, gsm=gsm)
        req = urllib.request.Request(url=page_url, headers=header)

        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                page = resp.read()
        except OSError as err:  # URLError, HTTPError and timeouts
            print('pn=%s 请求失败，跳过: %s' % (num, err))
            continue

        try:
            response = page.decode('utf-8')
        except UnicodeDecodeError:
            # The page contains bytes outside UTF-8; skip it as before.
            continue

        # Download only the URLs of this page. Accumulating them across
        # pages and re-reading the first 100 saved the same images again
        # on every iteration.
        for img_url in thumb_pattern.findall(response):
            if not img_url:
                continue
            try:
                with urllib.request.urlopen(img_url, timeout=timeout) as img:
                    pngdata = img.read()
            except (OSError, ValueError) as err:
                # A dead or malformed thumbnail link must not stop the crawl.
                print('下载失败，跳过: %s' % err)
                continue

            # NOTE(review): thumbnails are typically JPEG data; the ".png"
            # suffix is kept so existing file names do not change.
            path = os.path.join(img_path, word + str(x) + '.png')
            with open(path, 'wb') as f:  # binary mode is required for images
                f.write(pngdata)

            print('第%s张'%x)  # report the number just saved, then advance
            x += 1

if __name__ == '__main__':
    # Prompt for the search keyword and hand it straight to the crawler.
    Imgurl(input("请输入中文关键词："))
posted @ 2021-03-09 11:27 trysocket 阅读(380) 评论(0) 收藏举报
刷新页面返回顶部
trysocket

批量爬取10000张百度图片

公告