An Asynchronous Crawler Based on a Thread Pool

import re
import requests
from lxml import etree
from multiprocessing.dummy import Pool
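# Note: multiprocessing.dummy exposes the same Pool interface as multiprocessing,
# but backs it with threads instead of processes, which suits I/O-bound work such
# as downloading files.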


# Download the most popular videos from the Life (生活) channel of pearvideo.com

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'
}
# Fetch the Life channel listing page
html_live = requests.get(url='https://www.pearvideo.com/category_5', headers=headers).text

# Grab the name and detail-page URL of each of the channel's hottest videos
response = etree.HTML(html_live)
list_li = response.xpath('//ul[@id="listvideoListUl"]/li')
ls = []
for li in list_li:
    name = li.xpath('./div/a/div[@class="vervideo-title"]/text()')[0] + '.mp4'
    url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    dic = {
        'url': url,
        'name': name
    }
    ls.append(dic)
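# At this point ls holds one dict per video, e.g. (hypothetical values):
#   {'url': 'https://www.pearvideo.com/video_1679399', 'name': '<video title>.mp4'}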


# Visit a video's detail page and download the video file
def get_video(dic):
    url = dic["url"]
    name = dic["name"]
    print(name + ' is downloading...')
    page_data = requests.get(url=url, headers=headers).text
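    # At the time of writing, the detail page embeds the real video address in an
    # inline script as srcUrl="https://...mp4", so a simple regex can pull it out.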
    video_url = re.findall('srcUrl=(.*?),', page_data)[0].replace('"', '')
    video = requests.get(url=video_url, headers=headers).content
    # Write the video data to a local file
    with open(name, 'wb') as fp:
        fp.write(video)
        print(name + ' finished downloading')


pool = Pool(4)  # a pool of four worker threads
pool.map(get_video, ls)
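# map hands each dict in ls to get_video on one of the pool's worker threads and
# blocks until every download has returned.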

# Shut the pool down and wait for the worker threads to exit
pool.close()
pool.join()
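
# As a side note, the same thread-pool pattern can be expressed with the standard
# library's concurrent.futures; a minimal equivalent sketch (reusing the get_video
# function and ls list defined above), kept commented out so it does not re-run
# the downloads:
#
# from concurrent.futures import ThreadPoolExecutor
#
# with ThreadPoolExecutor(max_workers=4) as executor:
#     executor.map(get_video, ls)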