# 百度图片爬虫 (Baidu image crawler)
#
# 源代码出自大象乔布斯 (source code credited to 大象乔布斯)

import requests
import re
import os
import random
import time


def get_html(url, timeout=10):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The decoded response body. An empty string is returned when the
        request itself fails (connection error, timeout, invalid URL), so
        callers always receive a ``str``.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        print("抓取源代码发生错误:%s" % e)
        # Previously execution fell through to ``return html.text`` with
        # ``html`` unbound, raising UnboundLocalError after the message.
        return ""

    # Use the encoding detected from the body rather than the one declared
    # in the HTTP headers.
    response.encoding = response.apparent_encoding
    if response.status_code == 200:
        print("成功获取源码")

    return response.text

def parsehtml(html):
    """Extract every ``objURL`` value from the page source.

    Args:
        html: Page source; it is converted with ``str()`` before matching.

    Returns:
        A list of the strings found between ``"objURL":"`` and the next
        double quote, in order of appearance (empty if there are none).
    """
    # re.S lets ``.`` match newlines, so a value may span several lines.
    obj_url_pattern = re.compile('"objURL":"(.*?)"', re.S)
    return [match.group(1) for match in obj_url_pattern.finditer(str(html))]

def downloadImg(urls, wold):
    """Download each image URL into a directory named after the keyword.

    Files are written as ``<wold>/<wold><n>.jpg``, where ``n`` counts the
    successfully saved images starting at 1.

    Args:
        urls: Iterable of image URLs to fetch.
        wold: Search keyword; used as both the directory name and the
            file-name prefix.
    """
    # Create the target directory if needed. The working directory is left
    # untouched (no os.chdir), so repeated calls do not nest directories.
    os.makedirs(wold, exist_ok=True)

    i = 1
    for url in urls:
        # Random 1-4 second pause between requests.
        time.sleep(random.randint(1, 3) + random.random())

        try:
            response = requests.get(url, timeout=6)
        except requests.RequestException:
            # A dead or slow link must not abort the remaining downloads.
            print("连接超时,图片下载失败")
            continue

        # Treat error responses as failures so HTML error pages are not
        # saved with a .jpg extension.
        img = response.content if response.status_code == 200 else b""
        if not img:
            print("连接超时,图片下载失败")
            continue

        path = os.path.join(wold, wold + str(i) + '.jpg')
        with open(path, 'wb') as f:
            print("正在下载第 %d 张图片:%s " % (i, url))
            f.write(img)
        print("图片下载成功")
        i += 1
# Note: files opened with a `with` block are closed automatically; no explicit f.close() is needed.
# Script entry point: ask for a search keyword, fetch the Baidu image result
# page for it, extract the image URLs and download them.
if __name__ == '__main__':
    wold = input("请输入您要下载的图片:")
    url = "http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=" + wold + "&ct=201326592&v=flip"
    html = get_html(url)
    urls = parsehtml(html)
    downloadImg(urls, wold)

 

# posted @ 2021-12-26 20:17  powermain  阅读(55)  评论(0)    收藏  举报