Scraping Basics: Downloading Avatar Images
This spider downloads every avatar image from https://www.woyaogexing.com/touxiang/; you only need to adjust a few parameters (the page range and the save path) to adapt it. (For learning purposes only!)
```python
import os
import re

import requests
from lxml import etree


class TouXiangSpider(object):
    def __init__(self, url, page_name):
        self.page_name = page_name
        self.url = url
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.114 Safari/537.36'
        }

    # Entry point
    def start(self):
        self.get_pic_href()

    # Request helper: returns the raw response when flag is set,
    # otherwise an lxml tree parsed from the response body
    def get_request(self, url, flag=None):
        response = requests.get(url=url, headers=self.headers)
        if flag:
            return response
        return etree.HTML(response.text)

    # Collect the album links and titles from one listing page
    def get_pic_href(self):
        dir_name_list = []
        html = self.get_request(self.url)
        pic_href_list = html.xpath('//div[@class="pMain"]/div/a[last()]/@href')
        pic_dir_list = html.xpath('//div[@class="pMain"]/div/a[last()]/text()')
        # Sanitizing rule for folder names: match anything that is not a
        # Chinese character, a word character (letter/digit/underscore), or '/'
        name_pattern = re.compile(r'[^\u4e00-\u9fa5\w/]+')
        for pic_dir_name in pic_dir_list:
            # Undo the ISO-8859-1 mis-decoding so the Chinese title displays
            # correctly, strip the matched characters, and replace '/' so the
            # title is a valid directory name
            dir_name = name_pattern.sub('', pic_dir_name.encode('ISO-8859-1').decode('utf-8')).replace('/', '-')
            dir_name_list.append(dir_name)
        for index, pic_href in enumerate(pic_href_list):
            full_pic_href = 'https://www.woyaogexing.com' + pic_href
            self.get_touxiang_href(full_pic_href, dir_name_list[index])

    # Collect the download links for every avatar in one album
    def get_touxiang_href(self, pic_url, dir_name):
        html = self.get_request(pic_url)
        touxiang_href_list = html.xpath('//li[@class="tx-img"]/a/@href')
        # Create the folder for this album (no error if it already exists)
        os.makedirs(f'E:/photo/{self.page_name}/{dir_name}', exist_ok=True)
        for name_index, touxiang_href in enumerate(touxiang_href_list):
            full_touxiang_href = 'https:' + touxiang_href
            self.get_picture(full_touxiang_href, name_index + 1, dir_name)

    # Save one image
    def get_picture(self, url, pic_name, dir_name):
        response = self.get_request(url, flag=1)
        path = f'E:/photo/{self.page_name}/{dir_name}/{pic_name}.jpg'  # save path
        with open(path, 'wb') as fp:
            fp.write(response.content)


if __name__ == '__main__':
    index_url = 'https://www.woyaogexing.com/touxiang/'  # URL of the first page
    page_url = 'https://www.woyaogexing.com/touxiang/index_{}.html'  # URL pattern of later pages
    for i in range(1, 21):  # crawl the first 20 pages of avatars
        os.makedirs(f'E:/photo/page{i}', exist_ok=True)  # per-page folder (also creates E:/photo)
        page_name = f'page{i}'
        if i == 1:
            spider = TouXiangSpider(index_url, page_name)
        else:
            spider = TouXiangSpider(page_url.format(i), page_name)
        spider.start()
```
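The trickiest line is the directory-name sanitization: the site's responses get decoded as ISO-8859-1 by default, so the album titles arrive garbled and must be re-decoded as UTF-8 before the regex strips unwanted characters. Here is a small standalone demo of just that step; the sample title is made up, but the pattern and the ISO-8859-1/UTF-8 round trip are taken verbatim from the spider above:

```python
import re

# The same sanitizing pattern as in the spider: anything that is not a
# Chinese character, a word character, or '/'
name_pattern = re.compile(r'[^\u4e00-\u9fa5\w/]+')

# A hypothetical album title, mis-decoded the way requests would return it
# when the response lacks a charset header (UTF-8 bytes read as ISO-8859-1)
raw_title = '可爱头像 2023/冬季'.encode('utf-8').decode('ISO-8859-1')

# Reverse the mis-decoding, strip the matched characters, and replace '/'
# so the title is a valid directory name
dir_name = name_pattern.sub('', raw_title.encode('ISO-8859-1').decode('utf-8')).replace('/', '-')
print(dir_name)  # -> 可爱头像2023-冬季
```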
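The spider also fires requests in a tight loop with no timeout and no status check, so a slow or failing server can hang it or leave error pages saved as .jpg files. If you want to be gentler and more robust, a hardened version of `get_request` might look like the sketch below; the delay and timeout values are assumptions, not part of the original code:

```python
import time

import requests
from lxml import etree

HEADERS = {'user-agent': 'Mozilla/5.0'}  # use the full UA string from the spider above

def get_request(url, flag=None, delay=0.5, timeout=10):
    """Fetch a URL politely: pause between requests, fail fast on network
    errors, and surface non-200 responses instead of silently saving them."""
    time.sleep(delay)                                          # assumed throttle value
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()                                # raise on 4xx/5xx
    if flag:
        return response                                        # raw response for binary content
    return etree.HTML(response.text)                           # parsed tree for XPath queries
```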
The azure sky awaits the misty rain, and I am waiting for you!