爬虫-多进程抓取斗图表情
import requests
from lxml import etree
import os
import time
from multiprocessing import Pool
# Default HTTP headers sent with every request; the User-Agent string
# identifies the client as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/104.0.0.0 Safari/537.36'
    ),
}
def get_img_src(page):
    """Yield the image URLs found on each listing page, one list per page.

    Listing pages 1 through ``page`` (inclusive) are requested in order,
    with a one-second pause before each request.  A page whose request
    fails is reported and skipped; a page whose body is empty or cannot be
    parsed yields an empty list.

    :param page: number of the last listing page to fetch.
    :return: generator yielding, for each fetched page, the list of
        ``data-original`` attribute values of the ``<img>`` elements whose
        ``referrerpolicy`` attribute is ``no-referrer``.
    """
    for i in range(1, page + 1):
        url = f'https://www.pkdoutu.com/photo/list/?page={i}'
        print(f'开始抓取第{i}页数据')
        time.sleep(1)
        try:
            # The timeout keeps a stalled connection from hanging the crawl.
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page should not abort the remaining pages.
            print(f'第{i}页抓取失败: {exc!r}')
            continue
        res.encoding = res.apparent_encoding
        res_data = res.text
        if not res_data.strip():
            # lxml cannot build a tree from an empty document.
            yield []
            continue
        tree = etree.HTML(res_data)
        if tree is None:
            yield []
            continue
        # Collect the image addresses.
        img_list = tree.xpath('//img[@referrerpolicy="no-referrer"]/@data-original')
        yield img_list
def download_img(url):
    """Download one image and store it in the local ``img`` directory.

    The file name is the text after the last ``_`` in ``url``, reduced to
    its final path component so it can never contain a directory separator.

    :param url: address of the image to download.
    :return: None
    :raises requests.RequestException: if the request fails, times out, or
        the server answers with an HTTP error status.
    """
    time.sleep(0.1)
    # The timeout keeps a stalled connection from blocking a pool worker.
    img_res = requests.get(url, headers=headers, timeout=10)
    # Do not save an HTTP error page as if it were an image.
    img_res.raise_for_status()
    # Split off the name; basename() guards against URLs without '_',
    # where the split would return the whole URL including its slashes.
    img_name = os.path.basename(url.split('_')[-1])
    path = 'img'
    # exist_ok avoids the check-then-create race between pool workers that
    # all try to create the directory at the same moment.
    os.makedirs(path, exist_ok=True)
    # Write the image bytes to disk.
    with open(os.path.join(path, img_name), 'wb') as f:
        f.write(img_res.content)
if __name__ == '__main__':
    def report_failure(exc):
        """Print the exception raised by a failed download task."""
        print(f'下载失败: {exc!r}')

    pool = Pool(10)
    # get_img_src is a generator that yields one list of image URLs per page.
    for url_list in get_img_src(10):
        for url in url_list:
            # Queue the download on the process pool.  Without an
            # error_callback, an exception raised in a worker would be
            # stored in the unused AsyncResult and never seen.
            pool.apply_async(
                download_img,
                args=(url,),
                error_callback=report_failure,
            )
    pool.close()  # no more tasks will be submitted
    pool.join()  # wait for the queued downloads to finish
    print('抓取结束')
效果展示:


浙公网安备 33010602011771号