Searching Bilibili (B站) videos with Python and downloading them
The loop over the search results was originally unfinished, so only the first result was downloaded; this will later be reworked into an interface that can be called on its own. The downloaded audio and video streams also have to be merged, which requires installing ffmpeg first.
2021/1/23
Changelog:
1. Finished downloading all videos on the results page
2. Added error handling for videos that have no playable resource
3. After merging a video, the unmerged source files are now removed
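For reference, the merge step in the script below shells out to the ffmpeg binary with a stream-copy command. A minimal sketch of that call, assuming ffmpeg is on PATH and using a placeholder file name (the script itself uses a hard-coded Windows path), looks like this:

import subprocess

# Minimal sketch of the merge: copy the video and audio streams into one
# container without re-encoding. "example" is a placeholder title.
name = "example"
subprocess.run(
    ['ffmpeg', '-i', f'{name}.mp4', '-i', f'{name}.mp3',
     '-vcodec', 'copy', '-acodec', 'copy', f'{name}(合).mp4'],
    check=True,
)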
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : 黑羽青衣
# @File : ${NAME}.py

import requests
from urllib import parse, request
import urllib.request
from bs4 import BeautifulSoup
import re
import os
import subprocess
import time
import json
import sys
import io
import ffmpeg

# Force UTF-8 output so Chinese titles print correctly on Windows consoles
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class BiliBili(object):
    def __init__(self, url, keyword):
        self.url = url
        self.keyword = keyword

    def html(self, url):
        # Fetch the search results page
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }
        html = requests.get(url, headers=headers)
        return html.text

    def get_video_html(self, url):
        # Fetch a single video page
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # 'Cookie': cookie
        }
        response = requests.get(url, headers=headers)
        return response.text

    def get_video_info(self, html):
        # Extract the video stream URL from the window.__playinfo__ JSON embedded in the page
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_video_url = html_data['data']['dash']['video'][0]['backup_url'][0]
        return download_video_url

    def get_audio_info(self, html):
        # Extract the audio stream URL from the same JSON
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_audio_url = html_data['data']['dash']['audio'][0]['backup_url'][0]
        return download_audio_url

    def search_video_info(self, html):
        # Parse the search results page into a dict: video title -> video page URL
        soup = BeautifulSoup(html, "html.parser")
        video_info = {}
        for tag in soup.find_all('div', class_='info'):
            title = tag.find('a', class_='title').get_text()
            people_num = tag.find('span', class_='so-icon watch-num').get_text()
            up_name = tag.find('a', class_='up-name').get_text()
            video_url = tag.find('a').get('href')
            video_url = video_url.replace('//', '')
            video_info[title] = video_url
        return video_info

    def search_video(self, html):
        video_info = self.search_video_info(html)
        self.run_video(video_info, self.url)

    def run_search(self):
        # Fetch the search results, then download every video found on the page
        html = self.html(self.url)
        self.search_video(html)

    def run_video(self, video_info, url):
        # For each search result, resolve the download links and save the streams
        video_size = 0
        audio_size = 0
        for title, video_url in video_info.items():
            get_video_html = self.get_video_html('https://' + video_url)
            download_video_url = self.get_video_info(get_video_html)
            download_audio_url = self.get_audio_info(get_video_html)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Referer': 'https://' + video_url,
                'Accept-Encoding': "gzip, deflate, br",
                'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
                'Connection': 'keep-alive',
                # 'Cookie': cookie
            }
            try:
                video_content = requests.get(download_video_url, stream=True, headers=headers)
                mp4_file_size = int(video_content.headers['content-length'])
                if video_content.status_code == 200:
                    print('[文件大小]:%0.2f MB' % (mp4_file_size / 1024 / 1024))
                    with open(title + '.mp4', mode='wb') as mp4:
                        for chunk in video_content.iter_content(chunk_size=1024):
                            if chunk:
                                mp4.write(chunk)
                                video_size += len(chunk)  # bytes downloaded so far
            except BaseException:
                print('凉凉,下载失败')
            try:
                audio_content = requests.get(download_audio_url, stream=True, headers=headers)
                mp3_file_size = int(audio_content.headers['content-length'])
                if audio_content.status_code == 200:
                    print('[文件大小]:%0.2f MB' % (mp3_file_size / 1024 / 1024))
                    with open(title + '.mp3', mode='wb') as mp3:
                        for chunk in audio_content.iter_content(chunk_size=1024):
                            if chunk:
                                mp3.write(chunk)
                                audio_size += len(chunk)
            except BaseException:
                print('凉凉,下载失败')

            print('正在保存:', title)
            if os.path.exists(title + '.mp4'):
                if self.video_audio_merge_single(title):
                    continue
            time.sleep(60)

    def video_audio_merge_single(self, video_name):
        # Merge the downloaded video and audio streams with ffmpeg (stream copy, no re-encode)
        print('视频合成开始:', video_name)
        ffm = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe "
        command = ffm + ' -i "{}.mp4" -i "{}.mp3" -vcodec copy -acodec copy "{}.mp4"'.format(
            video_name, video_name, video_name + '(合)')
        subprocess.Popen(command, shell=True)
        print(command)
        time.sleep(10)  # crude fixed wait for ffmpeg to finish before deleting the sources
        print("视频合成结束:", video_name)
        os.remove(f'{video_name}.mp3')
        os.remove(f'{video_name}.mp4')
        return True


if __name__ == '__main__':
    url = 'https://search.bilibili.com/all?'
    keyword = 'Python'
    keyword = urllib.parse.quote(keyword)
    param = 'keyword=' + keyword + '&from_source=nav_searchs'
    url = url + param
    BB = BiliBili(url, keyword)
    BB.run_search()
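Note that the script imports ffmpeg (the ffmpeg-python binding) but never uses it: the merge goes through subprocess and a hard-coded ffmpeg.exe path. A rough equivalent of that merge using ffmpeg-python would look like the sketch below; this is my own assumption rather than part of the original script, and it still needs the ffmpeg binary installed. It also blocks until the merge finishes, which avoids the fixed 10-second sleep before the source files are deleted.

import ffmpeg

def merge_with_ffmpeg_python(video_name):
    # Sketch only: combine <title>.mp4 (video) and <title>.mp3 (audio) into one
    # file, copying both streams instead of re-encoding.
    video = ffmpeg.input(f'{video_name}.mp4')
    audio = ffmpeg.input(f'{video_name}.mp3')
    ffmpeg.output(
        video, audio, f'{video_name}(合).mp4',
        vcodec='copy', acodec='copy',
    ).run(overwrite_output=True)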
