bilibili批量下载
学习一下如何用爬虫爬取b站视频。一开始准备在b站上下载的,但是没找到下载按钮,就自己研究一下。
本来是准备用他网页的blob开头的地址转成可下载链接的,结果发现他这段是js后续生成的。解决方法也有,搞个chrome驱动,然后用selenium等待页面加载完成再去获取页面数据。但是有点麻烦,而且chrome还不能升级,升级了得同步更新chrome驱动。
然后去网上找了一下其他大神的操作,直接分析接口找到了下载视频和音频的两个接口地址,然后直接请求这俩地址,再通过ffmpeg合成一个视频。既然有现成的轮子就直接拿过来学习学习了。
不过有几个坑,直接抓去网页标题作为视频标题的时候要注意一些字符,可能会导致文件保存失败,可以用正则表达式简单的过滤一下。
别的坑好像也没啥,直接看代码也能理解。代码贴在下面了,懒人如果直接要用的话,下几个依赖库,改一下视频地址就能直接下载了。标记了视频下载进度,如果中间出错了,在range里面重新定义下范围就好了。
import json
import os
import re
import requests
from concurrent.futures import ThreadPoolExecutor
from random import choice
from lxml import etree # type: ignore
#设置请求头等参数,防止被反爬
headers = {
'Accept': '*/*',
'Accept-Language': 'en-US,en;q=0.5',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# 文件保存路径
path = "G:/"
def get_user_agent():
'''获取随机用户代理'''
user_agents = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
"Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
"Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
"MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
"Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
"Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36",
"Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20",
"Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
"Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
]
# 在user_agent列表中随机产生一个代理,作为模拟的浏览器
user_agent = choice(user_agents)
return user_agent
def download_video_single(referer_url, video_url, audio_url, video_name):
'''单个视频下载'''
# 更新请求头
headers.update({"Referer": referer_url})
received_video = 0
fileName = '%s_video.mp4' % video_name
if not os.path.exists(fileName):
print("视频下载开始:%s" % video_name)
# 下载并保存视频
video_content = requests.get(video_url, headers=headers)
print('%s\t:' % video_name, round(int(video_content.headers.get('content-length', 0)) / 1024 / 1024, 2), '\tMB')
with open('%s_video.mp4' % video_name, 'ab') as output:
headers['Range'] = 'bytes=' + str(received_video) + '-'
response = requests.get(video_url, headers=headers)
output.write(response.content)
fileName = '%s_audio.mp4' % video_name
if not os.path.exists(fileName):
# 下载并保存音频
audio_content = requests.get(audio_url, headers=headers)
print('%s\t音频大小:' % video_name, round(int(audio_content.headers.get('content-length', 0)) / 1024 / 1024, 2), '\tMB')
received_audio = 0
with open('%s_audio.mp4' % video_name, 'ab') as output:
headers['Range'] = 'bytes=' + str(received_audio) + '-'
response = requests.get(audio_url, headers=headers)
output.write(response.content)
received_audio += len(response.content)
print("视频下载结束:%s" % video_name)
video_audio_merge_single(video_name)
def single_download(aid, acc_quality, index):
'''单个视频实现下载'''
# 请求视频链接,获取信息
origin_video_url = 'https://www.bilibili.com/video/' + aid
res = requests.get(origin_video_url, headers=headers)
html = etree.HTML(res.text)
title = html.xpath('//*[@id="mirror-vdcon"]/div[2]/div/div[6]/div[1]/div[2]/div/div[{}]/div[1]/div[2]/text()'.format(index))[0]
# 使用下面方法获取标题时,记得去除标题里的非法字符。
# collections_name = path+html.xpath('//*[@id="viewbox_report"]/div[1]/div/h1/text()')[0]
collections_name = path+"videos"
print('您当前正在下载:', title)
if not os.path.exists(collections_name):
os.mkdir(collections_name)
video_info_temp = re_video_info(res.text, '__playinfo__=(.*?)</script><script>')
video_info = {}
# 获取视频质量
quality = video_info_temp['data']['accept_description'][acc_quality]
# 获取视频时长
video_info['duration'] = video_info_temp['data']['dash']['duration']
# 获取视频链接
video_url = video_info_temp['data']['dash']['video'][acc_quality]['baseUrl']
# 获取音频链接
audio_url = video_info_temp['data']['dash']['audio'][acc_quality]['baseUrl']
# 计算视频时长
video_time = int(video_info.get('duration', 0))
video_minute = video_time // 60
video_second = video_time % 60
print('当前视频清晰度为{},时长{}分{}秒'.format(quality, video_minute, video_second))
# 调用函数下载保存视频
download_video_single(origin_video_url, video_url, audio_url, collections_name+"/"+title.replace(" ", ""))
def re_video_info(text, pattern):
'''利用正则表达式匹配出视频信息并转化成json'''
match = re.search(pattern, text)
# print(match.group(1))
return json.loads(match.group(1))
def remove_special_chars(text):
pattern = r'[^\w\u4e00-\u9fa5.]'
return re.sub(pattern, '', text)
def video_audio_merge_single(video_name):
'''使用ffmpeg单个视频音频合并'''
print("视频合成开始:%s" % video_name)
import subprocess
command = 'ffmpeg -i %s_video.mp4 -i %s_audio.mp4 -c copy %s.mp4 -y -loglevel error' % (
video_name, video_name, video_name)
progress = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8')
while True:
output = progress.stdout.readline()
if output == '' and progress.poll() is not None:
break
if output:
print(output.strip())
# 你可以在这里添加代码来解析特定的进度信息,例如使用正则表达式
match = re.search(r'frame=(\d+) *', output)
if match:
frame_number = int(match.group(1))
print(f'Processed frame: {frame_number}')
delete_file('%s_video.mp4' % video_name)
delete_file('%s_audio.mp4' % video_name)
def delete_file(filename):
if os.path.exists(filename):
os.remove(filename)
print(f"文件 {filename} 删除成功!")
else:
print(f"文件 {filename} 不存在。")
if '_main_':
for i in range(51):
print("当前准备下载第{}个视频".format(i))
single_download("BV1sw411971F?spm_id_from=333.788.videopod.episodes&p={}".format(i+1), 0, i+1)

浙公网安备 33010602011771号