'''
Crawl meme images with multiple threads and a pair of queues
(producer threads parse the list pages, consumer threads download the images).
URL: https://fabiaoqing.com/biaoqing/lists/page/{}.html
'''
import requests
from lxml import etree
import os
import re
from urllib import request
from queue import Queue, Empty
import threading
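
# Producer threads request and parse the list pages and push (filename, image URL)
# pairs into img_queue; Consumer threads pull from img_queue and save the files.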

class Producer(threading.Thread):
    '''
    Request and parse the list pages, putting each image's download URL
    and file name into the image queue.
    '''
    def __init__(self, url_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                # Non-blocking get: exit once the URL queue has been drained,
                # even if another producer grabbed the last URL first.
                url = self.url_queue.get(block=False)
            except Empty:
                break
            self.parse_page(url)
    def parse_page(self, url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"
        }
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        text = response.text
        html = etree.HTML(text)
        imgEle = html.xpath('//div[@class="tagbqppdiv"]//img')
        for img in imgEle:
            title = img.get('title')
            img_url = img.get('data-original')
            # Strip punctuation from the title so it can be used as a file name
            title = re.sub(r'[\-+*.?。,!?、/()“”">::]*', '', title)
            # os.path.splitext() splits off the file extension of the image URL
            new_title = title + os.path.splitext(img_url)[1]
            # Put the file name and the image URL into the queue
            self.img_queue.put((new_title, img_url))

class Consumer(threading.Thread):
    '''
    Download the queued images to the local ./image directory.
    '''
    def __init__(self, url_queue, img_queue, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_queue = url_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.url_queue.empty():
                break
            try:
                # Use a timeout so the thread can re-check the exit condition
                # instead of blocking forever on an empty queue.
                new_title, img_url = self.img_queue.get(timeout=5)
            except Empty:
                continue
            # Download the image
            request.urlretrieve(img_url, "./image/" + new_title)
            print(new_title + " downloaded")
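

# main() fills the URL queue with the first 100 list pages, then starts
# 5 producer threads and 5 consumer threads that share the two queues.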
def main():
    url_queue = Queue(100)
    img_queue = Queue(500)
    # Make sure the download directory exists before the consumers start
    os.makedirs("./image", exist_ok=True)
    url = "https://fabiaoqing.com/biaoqing/lists/page/{}.html"
    for i in range(1, 101):
        new_url = url.format(i)
        url_queue.put(new_url)
    for i in range(5):
        p = Producer(url_queue, img_queue)
        p.start()
    for i in range(5):
        c = Consumer(url_queue, img_queue)
        c.start()


if __name__ == '__main__':
    main()