下载哔哩哔哩视频爬虫

 1 import json
 2 import requests
 3 import re
 4 
 5 
 6 class Bilibili(object):
 7     def __init__(self, goal_url):
 8         # 目标url
 9         self.url = goal_url
10         # 获取页面请求头
11         self.getHTMLHeaders = {
12             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
13                           "Chrome/69.0.3497.100 Safari/537 "
14         }
15         # 下载请求头
16         self.downloadHeaders = {
17             'Origin': 'https://www.bilibili.com',
18             'Referer': 'https://www.bilibili.com/video/av26522634',
19             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
20                           'Chrome/69.0.3497.100 Safari/537.36',
21         }
22 
23     # 获取页面
24     def get_source(self):
25         try:
26             response = requests.get(self.url, self.getHTMLHeaders)
27             if response.status_code == 200:
28                 return response.content.decode("utf-8")
29             else:
30                 print("获取页面失败...")
31                 return None
32         except requests.RequestException as e:
33             print(e)
34             return None
35 
36     # 解析页面
37     @staticmethod
38     def parse_source(source):
39         video_name = re.search('<title data-vue-meta="true">(.*?)</title>', source).group(1)
40         video_name = re.sub('[!。%¥#@*,、‘;’,./;【】`~ ]', '-', video_name)
41         # 只有一种画质时可偷懒
42         # video_url = re.search('"backup_url":\["(.*?)"]', source).group(1)
43         # video_url = re.search('"id"\:80\,"baseUrl":"(.*?)"', source).group(1)
44         # video_url = re.search('"backupUrl":\["(.*?)"\]', source).group(1)
45         # video_url = re.search('"base_url":"(.*?)"', source).group(1)
46         video_url = ''
47         pattern = r'\<script\>window\.__playinfo__=(.*?)\</script\>'
48         result = re.findall(pattern, source)[0]
49         temp = json.loads(result)
50         for item in temp['data']['dash']['video']:
51             video_url = item['baseUrl']
52         return{
53             "video_name": video_name,
54             "video_url": video_url
55         }
56 
57     # 下载并保存video
58     def save_video(self, video):
59         video_url = video["video_url"]
60         # stream参数设置立即下载响应内容
61         response = requests.get(video_url, headers=self.downloadHeaders, stream=True, verify=False)
62         video_name = video["video_name"] + '.flv'
63         with open(video_name, 'wb')as f:
64             f.write(response.content)
65 
66     # 统筹调用
67     def run(self):
68         print("正在获取页面...\n")
69         source = self.get_source()
70         print("正在解析title和url...\n")
71         video = self.parse_source(source)
72         self.save_video(video)
73         print("下载完成!")
74 
75 
# Video page to download.
url = "https://www.bilibili.com/video/av70414801"
# Build the downloader for that page, then run fetch -> parse -> save.
download_video = Bilibili(goal_url=url)
download_video.run()

 

import json
import requests
import re


class Bilibili(object):
    """Download the video behind a single Bilibili video page.

    The pipeline is: fetch the page HTML (``get_source``), extract the title
    and a stream URL from it (``parse_source``), then download that stream
    into a file in the current working directory (``save_video``).
    ``run`` chains the three steps.
    """

    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Number of bytes written per iteration while streaming the video to disk.
    CHUNK_SIZE = 1024 * 1024

    def __init__(self, goal_url):
        """Store the target URL and build the two request-header sets.

        Args:
            goal_url: URL of the video page to download from.
        """
        # Target page URL.
        self.url = goal_url
        # Headers used when fetching the HTML page.
        self.getHTMLHeaders = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/69.0.3497.100 Safari/537.36"
        }
        # Headers used when downloading the video stream.
        # NOTE(review): the Referer is a fixed video page rather than
        # ``goal_url``; presumably any bilibili.com page is accepted - confirm.
        self.downloadHeaders = {
            'Origin': 'https://www.bilibili.com',
            'Referer': 'https://www.bilibili.com/video/av26522634',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/69.0.3497.100 Safari/537.36',
        }

    def get_source(self):
        """Fetch the target page and return its HTML.

        Returns:
            The response body decoded as UTF-8, or ``None`` when the request
            raises a ``requests.RequestException`` or the status is not 200.
            In both failure cases a message is printed.
        """
        try:
            # The headers must be passed by keyword: the second positional
            # parameter of requests.get() is ``params``, which would put them
            # into the query string instead of sending them as headers.
            response = requests.get(
                self.url,
                headers=self.getHTMLHeaders,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as e:
            print(e)
            return None
        if response.status_code != 200:
            print("获取页面失败...")
            return None
        return response.content.decode("utf-8")

    @staticmethod
    def parse_source(source):
        """Extract the video title and a stream URL from the page HTML.

        Args:
            source: HTML text of the video page.

        Returns:
            A dict with two keys: ``"video_name"``, the page title with the
            listed punctuation characters and spaces replaced by ``-``; and
            ``"video_url"``, the ``baseUrl`` of the last entry of
            ``data.dash.video`` in the embedded play info, or ``''`` when
            that list is empty.

        Raises:
            ValueError: if the title tag or the ``window.__playinfo__``
                script cannot be found in ``source``.
        """
        title_match = re.search('<title data-vue-meta="true">(.*?)</title>', source)
        if title_match is None:
            raise ValueError("video title not found in page source")
        # Replace punctuation and spaces so the title can serve as a file name.
        video_name = re.sub('[!%#@*,、’,./;【】`~ ]', '-', title_match.group(1))

        # The play info is a JSON object assigned inside an inline <script>.
        pattern = r'\<script\>window\.__playinfo__=(.*?)\</script\>'
        playinfo_match = re.search(pattern, source)
        if playinfo_match is None:
            raise ValueError("window.__playinfo__ not found in page source")
        playinfo = json.loads(playinfo_match.group(1))

        # Several streams may be listed; the last one wins, which is what the
        # original overwrite-in-a-loop selected.
        streams = playinfo['data']['dash']['video']
        video_url = streams[-1]['baseUrl'] if streams else ''
        return {
            "video_name": video_name,
            "video_url": video_url
        }

    def save_video(self, video):
        """Download the stream and write it to ``<video_name>.flv``.

        Args:
            video: dict with ``"video_name"`` and ``"video_url"`` keys, as
                returned by ``parse_source``.

        Raises:
            requests.HTTPError: if the server answers with an error status;
                no file is created in that case.
        """
        video_name = video["video_name"] + '.flv'
        # stream=True defers fetching the body, so it can be written to disk
        # chunk by chunk instead of being held in memory as a whole.
        # NOTE(review): verify=False turns off TLS certificate verification;
        # kept from the original - confirm it is really required.
        with requests.get(
            video["video_url"],
            headers=self.downloadHeaders,
            stream=True,
            verify=False,
            timeout=self.REQUEST_TIMEOUT,
        ) as response:
            response.raise_for_status()
            with open(video_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    f.write(chunk)

    def run(self):
        """Run the pipeline: fetch the page, parse it, download the video."""
        print("正在获取页面...\n")
        source = self.get_source()
        if source is None:
            # get_source() has already printed the reason for the failure.
            return
        print("正在解析titleurl...\n")
        video = self.parse_source(source)
        self.save_video(video)
        print("下载完成!")


# Video page to download.
url = "https://www.bilibili.com/video/av70414801"
# Build the downloader for that page, then run fetch -> parse -> save.
download_video = Bilibili(goal_url=url)
download_video.run()
posted @ 2020-02-07 19:19  辣条小王籽  阅读(65)  评论(0)    收藏  举报