# 百度图片爬虫 (Baidu image crawler)
# 源代码出自大象乔布斯 (source code credited to 大象乔布斯)
import os
import random
import re
import time

import requests


def get_html(url):
    """Fetch ``url`` and return the response body as text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response body. An empty string is returned when the
        request itself fails, so callers can pass the result straight to
        ``parsehtml`` without special-casing errors.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"}
    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        # Let requests guess the charset from the body instead of trusting
        # the (often missing) Content-Type header.
        response.encoding = response.apparent_encoding
    except Exception as e:
        print("抓取源代码发生错误:%s" % e)
        # Previously execution fell through to ``return html.text`` with
        # ``html`` unbound, turning every network error into a crash.
        return ""
    if response.status_code == 200:
        print("成功获取源码")
    return response.text


def parsehtml(html):
    """Extract the image addresses embedded in a result page.

    Args:
        html: Page source; it is passed through ``str()`` before matching.

    Returns:
        A list with the value of every ``"objURL":"..."`` entry, in page
        order. The list is empty when nothing matches.
    """
    return re.findall(r'"objURL":"(.*?)"', str(html), re.S)


def downloadImg(urls, wold):
    """Download each address in ``urls`` into a directory named ``wold``.

    Files are written as ``<wold>/<wold><n>.jpg`` where ``n`` counts the
    successfully saved images starting at 1. A failed download is reported
    and skipped; it does not stop the remaining downloads.

    Args:
        urls: Iterable of image addresses.
        wold: Search keyword, used as both directory name and file prefix.
    """
    # Creates the directory only if it is missing, in one step.
    os.makedirs(wold, exist_ok=True)
    i = 1
    for url in urls:
        # Random 1-4 s pause between requests to avoid hammering the server.
        time.sleep(random.randint(1, 3) + random.random())
        try:
            response = requests.get(url, timeout=6)
            # Reject HTTP error pages instead of saving them as ".jpg".
            response.raise_for_status()
            img = response.content
        except requests.RequestException:
            print("连接超时,图片下载失败")
            continue
        if not img:
            print("连接超时,图片下载失败")
            continue
        # Build the path explicitly rather than calling os.chdir(), so the
        # process working directory is left untouched and repeated calls do
        # not nest directories inside each other.
        path = os.path.join(wold, wold + str(i) + '.jpg')
        with open(path, 'wb') as f:
            print("正在下载第 %d 张图片:%s " % (i, url))
            f.write(img)
        print("图片下载成功")
        i += 1
# No explicit f.close() is needed: each image file is written inside a
# ``with open(...)`` block, which closes it automatically.
if __name__ == '__main__':
    # Imported here because only the script entry point needs it.
    from urllib.parse import quote

    wold = input("请输入您要下载的图片:")
    # Percent-encode the keyword so spaces, '&', '#' and similar characters
    # cannot corrupt the query string.
    url = ("http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word="
           + quote(wold) + "&ct=201326592&v=flip")
    html = get_html(url)
    urls = parsehtml(html)
    downloadImg(urls, wold)
# 浙公网安备 33010602011771号