# 批量下载百度图片+保存到本地+全套代码+标准格式

"""批量下载图片"""

import os
import re
from urllib.parse import quote, urlparse
from urllib.request import Request, build_opener, install_opener, urlopen, urlretrieve
# Base URL of Baidu image search (flip-style result page); the URL-quoted
# search keyword is appended to the trailing "word" query parameter.
bdimg = "https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word="

class Crawler:
    """Baidu image crawler: fetches result pages for a keyword, extracts the
    original image URLs, and downloads them into a local ``img`` directory."""

    def __init__(self):
        """Initialize the crawler and create the image output directory."""
        pydir = os.path.dirname(os.path.abspath(__file__))
        self.imgdir = os.path.join(pydir, "img")
        # makedirs(exist_ok=True) avoids the check-then-create race of the
        # previous exists()/mkdir pair and also creates missing parents.
        os.makedirs(self.imgdir, exist_ok=True)
        # Number of files successfully downloaded so far.
        self.cnt = 0

    def getlinks(self, word, pn):
        """Return the original image URLs on result page *pn* for *word*.

        word: search keyword (URL-quoted before the request).
        pn:   zero-based result page index (Baidu serves 20 results per page).
        Returns a possibly-empty list of URL strings; any network/decoding
        error is reported and yields an empty list.
        """
        links = []
        print("搜索词条", word)
        try:
            # Fetch the Baidu image-search result page for the keyword.
            # Send a browser-like User-Agent: the default urllib UA is
            # frequently rejected by Baidu with an empty/blocked page.
            req = Request(bdimg + quote(word) + "&pn=" + str(pn * 20),
                          headers={"User-Agent": "Mozilla/5.0"})
            html = urlopen(req).read().decode()
            # Extract the original ("objURL") image links from the page source.
            links = re.findall(r'"objURL":"(.+?)"', html)
        except Exception as ex:
            print("发生错误", repr(ex))
        return links

    def savefile(self, pn, i, url):
        """Download *url* into the image directory as "<page>-<index>.jpg".

        pn:  zero-based page index (file name uses pn+1).
        i:   zero-based index within the page (file name uses i+1).
        url: direct image URL to download.
        Existing files are skipped; errors are reported, not raised.
        """
        imgfile = os.path.join(self.imgdir, str(pn+1)+"-"+ str(i+1)+".jpg")
        if not os.path.exists(imgfile):
            print(f"保存第{pn+1}页第{i+1}文件", url)
            try:
                res = urlparse(url)
                # Build request headers for the download: some image hosts
                # reject clients that omit a User-Agent or Referer.
                # Guard against a missing or dotless hostname — the old
                # unconditional split(".", 1)[1] raised and aborted the
                # download for such URLs.
                host = res.hostname or ""
                parts = host.split(".", 1)
                domain = parts[1] if len(parts) > 1 else host
                opener = build_opener()
                opener.addheaders = [
                    ("User-Agent", "Mozilla/5.0"),
                    ("Referer", f"{res.scheme}://www.{domain}/")]
                install_opener(opener)
                urlretrieve(url, imgfile)  # save the link's content to disk
                self.cnt += 1
            except Exception as ex:
                print("发生错误", repr(ex))

if __name__ == "__main__":
    # Crawl the first few result pages for the keyword and download
    # every extracted image link.
    crawler = Crawler()
    total_pages = 5
    for page in range(total_pages):
        for idx, link in enumerate(crawler.getlinks("美女", page)):
            crawler.savefile(page, idx, link)

    print(f"运行结束,共下载{crawler.cnt}个文件")







                
            

  

# posted @ 2022-01-27 12:05  白月如初12138  阅读(383)  评论(0)    收藏  举报