b站路飞学城python课梨视频项目代码

import os
import random
import re
from multiprocessing.dummy import Pool

import requests
from lxml import etree
 6 
 7 if __name__ == '__main__':
 8     # 生成一个存视频的文件夹
 9     if not os.path.exists('./video'):
10         os.mkdir('./video')
11 
12     url = 'https://www.pearvideo.com/category_5'
13     headers = {
14         "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400'
15     }
16     # proxies={'https': '62.210.38.37:3838'} 代理ip,用了太慢
17     response = requests.get(url=url, headers=headers)
18     page_text = response.text
19 
20     tree = etree.HTML(page_text)
21     li_list = tree.xpath('//*[@id="listvideoListUl"]/li')
22 
23     urls = []  # 储存所有视频的连接和名字
24     for li in li_list:
25         new_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
26         new_name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
27         # 这个方法行不通。因为mp4是动态加载出来的,因此需要抓包ajax请求中的url,不知道怎么用python抓包,用浏览器的抓包工具
28         new_page_text = requests.get(url=new_url, headers=headers).text
29         new_tree = etree.HTML(new_page_text)
30         name = new_tree.xpath('//*[@id="detailsbd"]/div[1]/div[2]/div/div[1]/h1/text()')[0]
31         # print(name)
32 
33         # 通过抓包ajax得到一个可以发送的url和请求伪装的视频的url,
34         id_ = str(li.xpath('./div/a/@href')[0]).split('_')[1]
35         # 可发送请求的url
36         ajax_url = 'https://www.pearvideo.com/videoStatus.jsp?'
37         params = {
38             'contId': id_,
39             'mrd': str(random.random())
40         }
41         ajax_headers = {
42             "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3823.400 QQBrowser/10.7.4307.400',
43             'Referer': 'https://www.pearvideo.com/video_' + id_
44         }
45         # 加了'Referer': 'https://www.pearvideo.com/video_1708144'后就不会显示该视频已下架了
46         dic_obj = requests.get(url=ajax_url, params=params, headers=ajax_headers).json()
47         video_url = dic_obj["videoInfo"]['videos']["srcUrl"]
48 
49         # 此处视频地址做了加密即ajax中得到的地址需要加上cont-,并且修改一段数字为id才是真地址
50         # 真地址:"https://video.pearvideo.com/mp4/third/20201120/cont-1708144-10305425-222728-hd.mp4"
51         # 伪地址:"https://video.pearvideo.com/mp4/third/20201120/1606132035863-10305425-222728-hd.mp4"
52 
53         # 得到真url,做字符串处理
54         video_true_url = ''
55         s_list = str(video_url).split('/')
56         # print(s_list)
57         for i in range(0, len(s_list)):
58             if i < len(s_list) - 1:
59                 video_true_url += s_list[i] + '/'
60             else:
61                 ss_list = s_list[i].split('-')
62                 # print(ss_list)
63                 for j in range(0, len(ss_list)):
64                     if j == 0:
65                         video_true_url += 'cont-' + id_ + '-'
66                     elif j == len(ss_list) - 1:
67                         video_true_url += ss_list[j]
68                     else:
69                         video_true_url += ss_list[j] + '-'
70         # print(video_true_url)
71 
72         dic = {
73             'name': name,
74             'url': video_true_url
75         }
76         urls.append(dic)
77 
78     # 使用线程池对视频数据进行请求(较为耗时的阻塞操作)
79     def get_video_data(dic_):
80         url_ = dic_['url']
81         print(dic_['name'], '正在下载.....')
82         video_data = requests.get(url=url_, headers=headers).content
83         video_path = './video/' + dic_['name']
84         with open(video_path, 'wb') as fp:
85             fp.write(video_data)
86             print(dic_['name'], '下载成功!!!!!')
87 
88 
89     pool = Pool(4)
90     pool.map(get_video_data, urls)
91 
92     pool.close()
93     pool.join()

 

posted @ 2020-11-23 21:26  千户-233  阅读(11777)  评论(1编辑  收藏  举报