1 import requests
2 from lxml import etree
3 import random
4 import os
5 from multiprocessing.dummy import Pool
6
# ---------------------------------------------------------------------------
# Settings shared by every request made by this script.
# ---------------------------------------------------------------------------
BASE_URL = 'https://www.pearvideo.com/'
CATEGORY_URL = BASE_URL + 'category_5'
# Endpoint found by inspecting the page's ajax traffic in the browser dev
# tools; it returns JSON describing where the video file lives.
AJAX_URL = BASE_URL + 'videoStatus.jsp?'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
    'Core/1.70.3823.400 QQBrowser/10.7.4307.400'
)
HEADERS = {"User-Agent": USER_AGENT}
# NOTE: a proxy (proxies={'https': '62.210.38.37:3838'}) was tried at one
# point but was too slow, so requests are sent directly.

VIDEO_DIR = './video'          # folder the downloaded videos are stored in
REQUEST_TIMEOUT = 30           # seconds; keeps a stalled connection from hanging the run
CHUNK_SIZE = 64 * 1024         # bytes written to disk per streamed chunk
POOL_SIZE = 4                  # number of concurrent download threads
FORBIDDEN_FILE_CHARS = '\\/:*?"<>|'


def safe_file_name(title, suffix='.mp4'):
    """Turn a video title into a file name that is safe to create on disk.

    Path separators, the characters Windows forbids in file names and ASCII
    control characters are replaced by ``_``; surrounding blanks and dots are
    dropped.  If nothing usable is left, ``'video'`` is used as the stem.

    :param title: title text scraped from the page.
    :param suffix: extension appended to the cleaned title.
    :return: the cleaned title followed by ``suffix``.
    """
    cleaned = ''.join(
        '_' if ch in FORBIDDEN_FILE_CHARS or ord(ch) < 32 else ch
        for ch in str(title).strip()
    ).strip(' .')
    return (cleaned or 'video') + suffix


def extract_cont_id(href):
    """Return the id part of a ``video_<id>`` link, or ``None`` if absent."""
    parts = str(href).split('_')
    if len(parts) < 2 or not parts[1]:
        return None
    return parts[1]


def build_real_video_url(video_url, cont_id):
    """Convert the address returned by the ajax call into the real one.

    The ajax response does not contain the playable address directly: the
    first dash-separated field of the file name has to be replaced by
    ``cont-<id>``.

    Real:  https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4
    Ajax:  https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4

    :param video_url: ``srcUrl`` value taken from the ajax response.
    :param cont_id: id of the video (the part after ``video_`` in its link).
    :return: the rewritten address.
    """
    directory, slash, file_name = str(video_url).rpartition('/')
    # Everything up to the first '-' is the field that gets swapped out.
    _, _, remainder = file_name.partition('-')
    return directory + slash + 'cont-' + cont_id + '-' + remainder


def fetch_video_info(li):
    """Collect the title and the real download address for one list item.

    :param li: ``<li>`` element taken from the category page's video list.
    :return: ``{'name': title, 'url': real_url}``, or ``None`` when the item
        cannot be resolved (missing link, failed request, or a response that
        carries no ``srcUrl``).
    """
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        return None
    href = str(hrefs[0])
    cont_id = extract_cont_id(href)
    if cont_id is None:
        return None

    # Title as shown in the list; used when the detail page gives none.
    list_titles = li.xpath('./div/a/div[2]/text()')
    name = list_titles[0] if list_titles else cont_id

    # The detail page is only used for the title.  The mp4 address cannot be
    # scraped from it because the player is loaded dynamically, which is why
    # the ajax endpoint is queried afterwards.
    try:
        detail_response = requests.get(
            url=BASE_URL + href, headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        detail_tree = etree.HTML(detail_response.text)
    except requests.RequestException:
        detail_tree = None
    if detail_tree is not None:
        detail_titles = detail_tree.xpath(
            '//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()'
        )
        if detail_titles:
            name = detail_titles[0]

    params = {
        'contId': cont_id,
        'mrd': str(random.random()),
    }
    ajax_headers = {
        "User-Agent": USER_AGENT,
        # Without this Referer the endpoint answers that the video has been
        # taken down.
        'Referer': BASE_URL + 'video_' + cont_id,
    }
    try:
        ajax_response = requests.get(
            url=AJAX_URL,
            params=params,
            headers=ajax_headers,
            timeout=REQUEST_TIMEOUT,
        )
        ajax_response.raise_for_status()
        dic_obj = ajax_response.json()
        video_url = dic_obj["videoInfo"]['videos']["srcUrl"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # ValueError: body is not JSON; KeyError/TypeError: JSON without the
        # expected videoInfo -> videos -> srcUrl structure.
        print(name, '获取视频地址失败:', repr(err))
        return None

    return {
        'name': name,
        'url': build_real_video_url(video_url, cont_id),
    }


def get_video_data(dic_):
    """Download one video into ``VIDEO_DIR``.

    The body is streamed to a ``.part`` file and renamed once complete, so an
    interrupted download never leaves a truncated ``.mp4`` behind.  Failures
    are reported and swallowed so that the other downloads running in the
    pool are not aborted.

    :param dic_: mapping with the keys ``'name'`` (title) and ``'url'``
        (real download address).
    """
    url_ = dic_['url']
    video_path = os.path.join(VIDEO_DIR, safe_file_name(dic_['name']))
    part_path = video_path + '.part'
    print(dic_['name'], '正在下载.....')
    try:
        with requests.get(
            url=url_, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT
        ) as response:
            # Do not save an HTML error page as if it were a video.
            response.raise_for_status()
            with open(part_path, 'wb') as fp:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    fp.write(chunk)
        os.replace(part_path, video_path)
    except (requests.RequestException, OSError) as err:
        print(dic_['name'], '下载失败:', repr(err))
        try:
            os.remove(part_path)
        except OSError:
            # Best-effort cleanup: the partial file may never have been made.
            pass
        return
    print(dic_['name'], '下载成功!!!!!')


def main():
    """Scrape the category page and download every video it lists."""
    # Create the folder the videos are stored in.
    os.makedirs(VIDEO_DIR, exist_ok=True)

    response = requests.get(
        url=CATEGORY_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    tree = etree.HTML(response.text)
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')

    # Title and real address of every video that could be resolved.
    urls = []
    for li in li_list:
        info = fetch_video_info(li)
        if info is not None:
            urls.append(info)
    if not urls:
        return

    # Downloading is slow, blocking I/O, so it is spread over a thread pool.
    pool = Pool(POOL_SIZE)
    try:
        pool.map(get_video_data, urls)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    main()