"""批量下载图片"""
from urllib.request import build_opener, install_opener, urlopen, urlretrieve
from urllib.parse import quote, urlparse
import os
import re
# Baidu image search endpoint (legacy "flip" page, whose HTML embeds "objURL"
# entries); callers append the percent-quoted keyword and an "&pn=" offset.
bdimg = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word="
class Crawler:
    """Baidu image-search crawler: fetches result pages and saves images."""

    def __init__(self):
        """Create the crawler and ensure the image output directory exists."""
        pydir = os.path.dirname(os.path.abspath(__file__))
        # Images are stored in an "img" folder next to this script.
        self.imgdir = os.path.join(pydir, "img")
        # makedirs with exist_ok avoids the exists()/mkdir() race of the original.
        os.makedirs(self.imgdir, exist_ok=True)
        self.cnt = 0  # number of files successfully downloaded

    def getlinks(self, word, pn):
        """Return original image URLs for search term *word* on page *pn*.

        *pn* is a zero-based page index; each Baidu result page holds 20
        entries. Returns an empty list on any network or parse error
        (errors are printed, never raised).
        """
        links = []
        print("搜索词条", word)
        try:
            # Submit the keyword to Baidu image search and fetch the result page.
            html = urlopen(bdimg + quote(word) + "&pn=" + str(pn * 20)).read().decode()
            # Extract the original image links ("objURL") from the page source.
            links = re.findall(r'"objURL":"(.+?)"', html)
        except Exception as ex:
            print("发生错误", repr(ex))
        return links

    def savefile(self, pn, i, url):
        """Download *url* to "<page>-<index>.jpg" unless it already exists.

        *pn* and *i* are zero-based; the file name is one-based. Increments
        self.cnt on success; download errors are printed, never raised.
        """
        imgfile = os.path.join(self.imgdir, str(pn + 1) + "-" + str(i + 1) + ".jpg")
        if os.path.exists(imgfile):
            return  # already downloaded on a previous run
        print(f"保存第{pn+1}页第{i+1}文件", url)
        try:
            res = urlparse(url)
            # Some hosts reject requests lacking a plausible User-Agent/Referer,
            # so install an opener carrying both before downloading.
            host = res.hostname or ""
            # BUG FIX: the original `hostname.split(".", 1)[1]` raised
            # IndexError for dotless hostnames (e.g. "localhost") and
            # AttributeError when the URL had no hostname at all; fall back
            # to the full host in those cases.
            domain = host.split(".", 1)[1] if "." in host else host
            opener = build_opener()
            opener.addheaders = [
                ("User-Agent", "Mozilla/5.0"),
                ("Referer", f"{res.scheme}://www.{domain}/")]
            install_opener(opener)
            urlretrieve(url, imgfile)  # save the link target to disk
            self.cnt += 1
        except Exception as ex:
            print("发生错误", repr(ex))
if __name__ == "__main__":
    # Crawl five result pages for the keyword and save every image found.
    crawler = Crawler()
    page_count = 5
    for page in range(page_count):
        for idx, link in enumerate(crawler.getlinks("美女", page)):
            crawler.savefile(page, idx, link)
    print(f"运行结束,共下载{crawler.cnt}个文件")