bilibili批量下载

学习一下如何用爬虫爬取b站视频。一开始准备在b站上下载的,但是没找到下载按钮,就自己研究一下。
本来是准备用他网页的blob开头的地址转成可下载链接的,结果发现他这段是js后续生成的。解决方法也有,搞个chrome驱动,然后用selenium等待页面加载完成再去获取页面数据。但是有点麻烦,而且chrome还不能升级,升级了得同步更新chrome驱动。
然后去网上找了一下其他大神的操作,直接分析接口找到了下载视频和音频的两个接口地址,然后直接请求这俩地址,再通过ffmpeg合成一个视频。既然有现成的轮子就直接拿过来学习学习了。
不过有几个坑,直接抓去网页标题作为视频标题的时候要注意一些字符,可能会导致文件保存失败,可以用正则表达式简单的过滤一下。
别的坑好像也没啥,直接看代码也能理解。代码贴在下面了,懒人如果直接要用的话,下几个依赖库,改一下视频地址就能直接下载了。标记了视频下载进度,如果中间出错了,在range里面重新定义下范围就好了。

import json
import os
import re
import requests
from concurrent.futures import ThreadPoolExecutor
from random import choice
from lxml import etree # type: ignore

#设置请求头等参数,防止被反爬
headers = {
   'Accept': '*/*',
   'Accept-Language': 'en-US,en;q=0.5',
   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
}
# 文件保存路径
path = "G:/"
def get_user_agent():
   '''获取随机用户代理'''
   user_agents = [
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
       "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
       "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
       "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
       "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
       "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
       "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
       "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
       "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
       "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
       "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
       "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
       "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
       "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
       "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
       "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
       "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
       "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
       "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
       "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
       "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
       "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
       "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36",
       "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20",
       "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
       "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
   ]
   # 在user_agent列表中随机产生一个代理,作为模拟的浏览器
   user_agent = choice(user_agents)
   return user_agent

def download_video_single(referer_url, video_url, audio_url, video_name):
    '''单个视频下载'''
    # 更新请求头
    headers.update({"Referer": referer_url})

    received_video = 0
    fileName = '%s_video.mp4' % video_name
    if not os.path.exists(fileName):
        print("视频下载开始:%s" % video_name)
        # 下载并保存视频
        video_content = requests.get(video_url, headers=headers)
        print('%s\t:' % video_name, round(int(video_content.headers.get('content-length', 0)) / 1024 / 1024, 2), '\tMB')
        with open('%s_video.mp4' % video_name, 'ab') as output:
            headers['Range'] = 'bytes=' + str(received_video) + '-'
            response = requests.get(video_url, headers=headers)
            output.write(response.content)

    fileName = '%s_audio.mp4' % video_name
    if not os.path.exists(fileName):
    # 下载并保存音频
        audio_content = requests.get(audio_url, headers=headers)
        print('%s\t音频大小:' % video_name, round(int(audio_content.headers.get('content-length', 0)) / 1024 / 1024, 2), '\tMB')
        received_audio = 0
        with open('%s_audio.mp4' % video_name, 'ab') as output:
            headers['Range'] = 'bytes=' + str(received_audio) + '-'
            response = requests.get(audio_url, headers=headers)
            output.write(response.content)
            received_audio += len(response.content)
        print("视频下载结束:%s" % video_name)
    video_audio_merge_single(video_name)  

def single_download(aid, acc_quality, index):
    '''单个视频实现下载'''
    # 请求视频链接,获取信息
    origin_video_url = 'https://www.bilibili.com/video/' + aid
    res = requests.get(origin_video_url, headers=headers)
    html = etree.HTML(res.text)
    title = html.xpath('//*[@id="mirror-vdcon"]/div[2]/div/div[6]/div[1]/div[2]/div/div[{}]/div[1]/div[2]/text()'.format(index))[0]
    # 使用下面方法获取标题时,记得去除标题里的非法字符。
    # collections_name = path+html.xpath('//*[@id="viewbox_report"]/div[1]/div/h1/text()')[0]
    collections_name = path+"videos"
    print('您当前正在下载:', title)
    if not os.path.exists(collections_name):
        os.mkdir(collections_name)

    video_info_temp = re_video_info(res.text, '__playinfo__=(.*?)</script><script>')
    video_info = {}
    # 获取视频质量
    quality = video_info_temp['data']['accept_description'][acc_quality]
    # 获取视频时长
    video_info['duration'] = video_info_temp['data']['dash']['duration']
    # 获取视频链接
    video_url = video_info_temp['data']['dash']['video'][acc_quality]['baseUrl']
    # 获取音频链接
    audio_url = video_info_temp['data']['dash']['audio'][acc_quality]['baseUrl']
    # 计算视频时长
    video_time = int(video_info.get('duration', 0))
    video_minute = video_time // 60
    video_second = video_time % 60
    print('当前视频清晰度为{},时长{}分{}秒'.format(quality, video_minute, video_second))
    # 调用函数下载保存视频
    download_video_single(origin_video_url, video_url, audio_url, collections_name+"/"+title.replace(" ", ""))

def re_video_info(text, pattern):
    '''利用正则表达式匹配出视频信息并转化成json'''
    match = re.search(pattern, text)
    # print(match.group(1))
    return json.loads(match.group(1))

def remove_special_chars(text):
    pattern = r'[^\w\u4e00-\u9fa5.]'
    return re.sub(pattern, '', text)

def video_audio_merge_single(video_name):
    '''使用ffmpeg单个视频音频合并'''
    print("视频合成开始:%s" % video_name)
    import subprocess
    command = 'ffmpeg -i %s_video.mp4 -i %s_audio.mp4 -c copy %s.mp4 -y -loglevel error' % (
        video_name, video_name, video_name)
    progress = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,  encoding='utf-8')
    

    while True:
        output = progress.stdout.readline()
        if output == '' and progress.poll() is not None:
            break
        if output:
            print(output.strip())
        # 你可以在这里添加代码来解析特定的进度信息,例如使用正则表达式
        match = re.search(r'frame=(\d+) *', output)
        if match:
            frame_number = int(match.group(1))
            print(f'Processed frame: {frame_number}')

    delete_file('%s_video.mp4' % video_name)
    delete_file('%s_audio.mp4' % video_name)

def delete_file(filename):
    if os.path.exists(filename):
        os.remove(filename)
        print(f"文件 {filename} 删除成功!")
    else:
        print(f"文件 {filename} 不存在。")

if '_main_':
    for i in range(51):
        print("当前准备下载第{}个视频".format(i))
        single_download("BV1sw411971F?spm_id_from=333.788.videopod.episodes&p={}".format(i+1), 0, i+1)
posted @ 2025-03-30 21:42  阿飞飞啊飞  阅读(75)  评论(1)    收藏  举报