爬虫基础之头像图片爬取

 此爬虫能够爬取https://www.woyaogexing.com/touxiang/的所有头像图片,只需要修改相关参数即可!此仅作为学习所用!

class TouXiangSpider(object):
    """Scrape avatar images from one listing page of www.woyaogexing.com/touxiang/.

    Workflow: listing page -> per-album pages -> individual image URLs ->
    images saved under E:/photo/<page_name>/<album_dir>/<index>.jpg.
    For learning purposes only.
    """

    def __init__(self, url, page_name):
        # page_name: directory name (under E:/photo/) that stores this listing page's albums
        self.page_name = page_name
        # url: the listing-page URL this spider instance crawls
        self.url = url
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
        }

    # Entry point
    def start(self):
        self.get_pic_href()

    # Request helper: returns the raw Response when flag is truthy (binary
    # downloads), otherwise an lxml HTML tree of the decoded page.
    def get_request(self, url, flag=None):
        # timeout so a stalled server cannot hang the crawl forever
        response = requests.get(url=url, headers=self.headers, timeout=10)
        if flag:
            return response
        # The site serves UTF-8 but omits a charset header, so requests would
        # guess ISO-8859-1 and mojibake all Chinese text; force the correct codec.
        response.encoding = 'utf-8'
        return etree.HTML(response.text)

    # Collect album links and album titles from the listing page, then crawl each album.
    def get_pic_href(self):
        html = self.get_request(self.url)
        pic_href_list = html.xpath('//div[@class="pMain"]/div/a[last()]/@href')
        pic_dir_list = html.xpath('//div[@class="pMain"]/div/a[last()]/text()')
        # Strip everything that is not a CJK char, word char, or '/' so the
        # album title becomes a safe directory name; '/' is mapped to '-'.
        name_pattern = re.compile(r'[^\u4e00-\u9fa5\w/]+')
        dir_name_list = [
            name_pattern.sub('', title).replace('/', '-')
            for title in pic_dir_list
        ]
        for pic_href, dir_name in zip(pic_href_list, dir_name_list):
            full_pic_href = 'https://www.woyaogexing.com' + pic_href
            self.get_touxiang_href(full_pic_href, dir_name)

    # Crawl one album page: create its directory, then download every avatar in it.
    def get_touxiang_href(self, pic_url, dir_name):
        html = self.get_request(pic_url)
        touxiang_href_list = html.xpath('//li[@class="tx-img"]/a/@href')
        # exist_ok replaces the old bare try/except around os.mkdir: an already
        # existing directory is fine, any other OS error should surface.
        os.makedirs(f'E:/photo/{self.page_name}/{dir_name}', exist_ok=True)
        for name_index, touxiang_href in enumerate(touxiang_href_list, start=1):
            full_touxiang_href = 'https:' + touxiang_href  # hrefs are protocol-relative
            self.get_picture(full_touxiang_href, name_index, dir_name)

    # Download one image and save it as <pic_name>.jpg in the album directory.
    def get_picture(self, url, pic_name, dir_name):
        response = self.get_request(url, flag=1)
        path = f'E:/photo/{self.page_name}/{dir_name}/{pic_name}.jpg'
        with open(path, 'wb') as fp:
            fp.write(response.content)

if __name__ == '__main__':
    index_url = 'https://www.woyaogexing.com/touxiang/'  # URL of the first listing page
    page_url = 'https://www.woyaogexing.com/touxiang/index_{}.html'  # URL template for page 2+
    for i in range(1, 21):  # crawl the first 20 listing pages
        page_name = f'page{i}'
        # exist_ok replaces the old bare try/except around os.mkdir: an already
        # existing directory is fine, any other OS error should surface.
        os.makedirs(f'E:/photo/{page_name}', exist_ok=True)
        # page 1 uses the plain index URL; later pages use the index_N.html pattern
        url = index_url if i == 1 else page_url.format(i)
        spider = TouXiangSpider(url, page_name)
        spider.start()

 

posted @ 2021-06-26 13:30  千叶千影  阅读(303)  评论(0)    收藏  举报