import requests
from lxml import etree
import re
import os
from multiprocessing.dummy import Pool
import random
# Directory the downloaded videos are written to.
VIDEO_DIR = "./video"
# Category listing page that is scraped for video entries.
LIST_URL = "https://www.pearvideo.com/category_59"
# AJAX endpoint that returns the video metadata (found with a packet-capture tool).
VIDEO_AJAX = "https://www.pearvideo.com/videoStatus.jsp?"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4455.2 Safari/537.36"
}
# (connect, read) timeout in seconds so a stalled connection cannot hang a worker forever.
REQUEST_TIMEOUT = (10, 60)
# Size of the chunks streamed to disk while downloading a video.
CHUNK_SIZE = 64 * 1024
# Characters that are not allowed in file names on common file systems.
INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def safe_filename(title, fallback="video"):
    """Return ``title`` turned into a usable ``.mp4`` file name.

    Characters that are illegal in file names are replaced with ``_`` and
    surrounding whitespace / trailing dots are removed.  If nothing is left,
    ``fallback`` is used as the base name.
    """
    cleaned = INVALID_FILENAME_CHARS.sub("_", title.strip()).strip(" .")
    return (cleaned or fallback) + ".mp4"


def build_true_url(video_url, video_num):
    """Convert the obfuscated ``srcUrl`` from the AJAX reply into the real URL.

    The address returned by the AJAX call is disguised: the first
    hyphen-separated token of the file name is a timestamp and has to be
    replaced with ``cont-<video number>``:

        real: https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4
        fake: https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4

    Only the file-name part is rewritten, so the result does not depend on
    how many directories the URL path has or on the token showing up
    elsewhere in the URL.

    Raises:
        ValueError: if the file name contains no ``-`` separated token.
    """
    base, slash, filename = video_url.rpartition("/")
    _, hyphen, rest = filename.partition("-")
    if not slash or not hyphen:
        raise ValueError("unexpected video url format: " + video_url)
    return "{}/cont-{}-{}".format(base, video_num, rest)


def collect_video_entries():
    """Scrape the listing page and return ``[{"name": ..., "url": ...}, ...]``.

    Entries whose markup or AJAX reply does not have the expected shape are
    reported and skipped instead of aborting the whole run.
    """
    response = requests.get(url=LIST_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    tree = etree.HTML(response.text)
    li_list = tree.xpath('//*[@id="listvideoList"]/ul/li')

    entries = []  # names and links of all videos
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath("./div/a/div[2]/text()")
        if not hrefs or not titles:
            # List item without the expected link/title nodes: nothing to fetch.
            continue
        video_id = hrefs[0]  # video id, e.g. video_1727785
        id_parts = video_id.split('_')
        if len(id_parts) < 2:
            continue
        video_num = id_parts[1]  # numeric part of the video id
        video_name = safe_filename(titles[0], fallback=video_id)

        params = {
            'contId': video_num,
            'mrd': str(random.random()),  # random number
        }
        # With the Referer header set, the endpoint no longer reports the
        # video as taken down and returns a dict containing its address.
        video_headers = dict(HEADERS)
        video_headers['Referer'] = 'https://www.pearvideo.com/' + video_id
        try:
            ajax_response = requests.get(
                url=VIDEO_AJAX,
                headers=video_headers,
                params=params,
                timeout=REQUEST_TIMEOUT,
            )
            ajax_response.raise_for_status()
            video_dic = ajax_response.json()
            video_url = video_dic["videoInfo"]["videos"]["srcUrl"]
            video_true_url = build_true_url(video_url, video_num)
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            print(video_name + " skipped: " + repr(exc))
            continue

        entries.append({
            "name": video_name,
            "url": video_true_url,
        })
    return entries


def get_video_data(dic):
    """Download one video described by ``dic`` (keys ``"url"`` and ``"name"``).

    The body is streamed to ``VIDEO_DIR/<name>`` in chunks so a large video is
    never held in memory as a whole.  A failed download is reported and its
    partial file removed; the exception is not propagated so the remaining
    downloads in the pool keep running.
    """
    url = dic["url"]
    name = dic["name"]
    path = os.path.join(VIDEO_DIR, name)
    print(name+"正在下载。。。。。。")
    try:
        with requests.get(url=url, headers=HEADERS, stream=True,
                          timeout=REQUEST_TIMEOUT) as response:
            response.raise_for_status()
            with open(path, "wb") as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:
                        fp.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print(name + " download failed: " + repr(exc))
        try:
            os.remove(path)
        except OSError:
            # Nothing was written (or it is already gone): nothing to clean up.
            pass
        return
    print(name+"下载成功!!!")


if __name__ == '__main__':
    # Create the directory for the videos.
    os.makedirs(VIDEO_DIR, exist_ok=True)
    urls = collect_video_entries()
    # Use a thread pool to request the videos concurrently.
    pool = Pool(4)
    try:
        pool.map(get_video_data, urls)
    finally:
        pool.close()
        pool.join()