import time
# from pyquery import PyQuery as pq
import commonMethod
import datetime
import requests
import re
import os
import json
# Matches the <source> tag served from the v3-web CDN in the rendered page
# HTML. The group captures the 486-488 characters that follow the host name,
# up to ` type="">`; the capture still contains the closing quote of the src
# attribute.
pattern_1 = '<source class="" src="//v3-web.douyinvod.com/(.{486,488}) type="">'
# Alternative hosts, currently disabled:
# pattern_2 = '<source class="" src="//v26-web.douyinvod.com/(.{486,488}) type="">'
# pattern_3 = '<source class="" src="//www.douyin.com/aweme/v1/play/(.+) type="">'
# Captures the numeric video id from a detail-page link. Written as a raw
# string so that `\d` reaches the regex engine instead of being an invalid
# string escape sequence.
pattern_4 = r'https://www.douyin.com/video/(\d+)'
def get_info_by_pattern(text, pattern):
    '''
    Find every non-overlapping match of a regular expression in a string.
    :param text: string to search
    :param pattern: regular expression source
    :return: list as produced by ``findall`` (captured groups when the
             pattern has groups, whole matches otherwise); empty if no match
    '''
    return re.findall(pattern, text)
def get_headdouyinvod_com():
    '''
    Build the browser-like HTTP request headers used for the video download.
    :return: a new dict on every call, so callers may modify it freely
    '''
    header_pairs = (
        ("Connection", "keep-alive"),
        ("Pragma", "no-cache"),
        ("Cache-Control", "no-cache"),
        ("sec-ch-ua", '"Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"'),
        ("sec-ch-ua-mobile", "?0"),
        ("sec-ch-ua-platform", '"Windows"'),
        ("Upgrade-Insecure-Requests", "1"),
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"),
        ("Sec-Fetch-Site", "none"),
        ("Sec-Fetch-Mode", "navigate"),
        ("Sec-Fetch-User", "?1"),
        ("Sec-Fetch-Dest", "document"),
        ("Accept-Language", "zh-CN,zh;q=0.9,en;q=0.8"),
    )
    return dict(header_pairs)
# Resolve the direct video link for a Douyin detail page.
def get_douyin_url(driver1, sharl_url):
    '''
    Open a Douyin video page in the given browser driver and extract the
    v3-web CDN video link from the rendered HTML.
    :param driver1: browser driver; this function calls its get, add_cookie,
                    refresh and execute_script methods
    :param sharl_url: video link; a value that does not contain 'https' is
                      appended to 'https://www.douyin.com/video/'
    :return: (video_url, video_id) on success. ('', '') when no matching
             <source> tag is found or when any step raises; the error is
             printed, never propagated.
    '''
    try:
        if 'https' not in sharl_url:
            sharl_url = 'https://www.douyin.com/video/' + sharl_url
        VID = ''
        id_matches = get_info_by_pattern(sharl_url, pattern_4)
        if id_matches:
            VID = id_matches[0]
        # The site has to be loaded once before cookies for its domain can
        # be attached to the session.
        driver1.get('https://www.douyin.com')
        # The trailing `...` entry is a placeholder, not a cookie dict; it is
        # skipped in the loop below.
        list_cooke = [
            {'domain': '.douyin.com', 'expiry': 1698999663, 'httpOnly': False, 'name': 'VIDEO_FILTER_MEMO_SELECT',
             'path': '/', 'secure': False, 'value': '%7B%22expireTime%22%3A1698999663897%2C%22type%22%3A1%7D'},
            ...]
        for cook in list_cooke:
            # Passing a non-dict (the placeholder) to add_cookie raises and
            # would abort the whole lookup.
            if not isinstance(cook, dict):
                continue
            driver1.add_cookie(cook)
        time.sleep(5)
        driver1.refresh()
        driver1.get(sharl_url)
        # Fixed wait before reading the page source.
        time.sleep(5)
        txt = driver1.execute_script("return document.documentElement.outerHTML")
        # Drop every 'amp;' so HTML-escaped '&amp;' separators become '&'.
        txt = txt.replace('amp;', '')
        source_matches = get_info_by_pattern(txt, pattern_1)
        if source_matches:
            # The capture still carries the closing quote of the src
            # attribute; strip quotes before rebuilding the absolute URL.
            herf1 = 'https://v3-web.douyinvod.com/' + source_matches[0].replace('"', '')
            print(herf1)
            return herf1, VID
    except Exception as ex:
        print('获取链接发生异常:', sharl_url, ex)
    # Reached both after an error and when no <source> tag matched, so the
    # caller can always unpack a 2-tuple.
    return '', ''
# p_res = get_info_by_pattern(txt, pattern_2)
# if len(p_res) > 0:
# p_res = p_res[0].replace('"', '')
# # print(len(p_res))
# herf1 = 'https://v26-web.douyinvod.com/' + p_res
# print(herf1)
# return herf1,VID
# p_res = get_info_by_pattern(txt, pattern_3)
# if len(p_res) > 0:
# p_res = p_res[0].replace('"', '')
# print(len(p_res))
# herf1 = 'https://www.douyin.com/aweme/v1/play/' + p_res
# print(herf1)
# return herf1,VID
# return '',VID
# Download a Douyin video file.
def _fetch_range_bytes(file_url, headers, max_attempts=3, retry_wait=10):
    '''
    Fetch one ranged GET and return its whole body as bytes.
    :param file_url: URL to request
    :param headers: request headers, including the Range header
    :param max_attempts: number of attempts before giving up
    :param retry_wait: seconds to sleep between attempts
    :return: response body
    :raises Exception: the last error seen once every attempt has failed
    '''
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(file_url, headers=headers, stream=True)
            try:
                if response.status_code in (200, 206):
                    # Read the full body before returning so that a failure
                    # halfway through never leaves a partial chunk on disk.
                    return response.content
                last_error = RuntimeError('unexpected HTTP status %d' % response.status_code)
            finally:
                response.close()
        except Exception as ex:
            last_error = ex
        print(last_error)
        if attempt < max_attempts:
            time.sleep(retry_wait)
    raise last_error


def dowfile_v3_web_douyinvod_com(file_url, fileName):
    '''
    Download file_url into fileName.
    The first response is streamed to the file. If it carries a
    Content-Range header announcing more than one page (128 KiB) of data,
    the remaining pages are requested with Range headers and appended.
    :param file_url: direct video URL
    :param fileName: path of the file to write; an existing file is replaced
    :return: None. Errors are printed, never propagated.
    '''
    page_size = 1024 * 128
    # Kept separate from the response headers so every request goes out
    # with the same browser-like headers.
    request_headers = get_headdouyinvod_com()
    strstart = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    print('开始下载:', strstart)
    try:
        response = requests.get(file_url, headers=request_headers, stream=True)
        try:
            if response.status_code not in (200, 206):
                # Nothing was downloaded, so do not report completion.
                print('下载文件发生异常:', file_url, 'HTTP %d' % response.status_code)
                return
            print(dict(response.headers))
            # 'wb' so that running the download again does not append a
            # second copy to a file left by an earlier run.
            with open(fileName, 'wb') as f:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        f.write(chunk)
            content_range = response.headers.get('Content-Range')
        finally:
            response.close()
        if content_range:
            # Content-Range has the form 'bytes <from>-<to>/<total>'.
            total_length = int(str(content_range).split('/')[1])
            if total_length > page_size:
                rest_size = total_length - page_size
                # Ceiling division: number of further pages to request.
                loop_count = (rest_size + page_size - 1) // page_size
                for loop_index in range(1, loop_count + 1):
                    # NOTE(review): these offsets assume the first response
                    # covered bytes 0..page_size inclusive - confirm against
                    # the Content-Range the server actually returns.
                    sfrom = page_size * loop_index + 1
                    sto = page_size * (loop_index + 1)
                    range_headers = dict(request_headers)
                    range_headers['Range'] = 'bytes=%d-%d' % (sfrom, sto)
                    print(loop_index, loop_count)
                    # A page that still fails after the retries aborts the
                    # download: appending later pages after a gap would
                    # leave a corrupt file.
                    data = _fetch_range_bytes(file_url, range_headers)
                    with open(fileName, 'ab') as f:
                        f.write(data)
                    time.sleep(1)
        strend = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print('完成下载:', strstart, strend)
    except Exception as ex:
        print('下载文件发生异常:', file_url, ex)
# Download Douyin video files.
def down_file(sharl_url_list, strDirectory):
    '''
    Download every Douyin video in the list into a directory.
    :param sharl_url_list: list of Douyin detail-page links, e.g.
        ['https://www.douyin.com/video/7294079788010999040','https://www.douyin.com/video/7293552737067928868']
    :param strDirectory: directory the files are stored in, e.g.
        D:/douyin_file_down/202310; created if it does not exist
    :return: None. Each video is saved as <video id>.mp4. Errors are
             printed, never propagated.
    '''
    driver1 = None
    try:
        if strDirectory:
            os.makedirs(strDirectory, exist_ok=True)
        driver1 = commonMethod.getDriver('', False)
        for sharl_url in sharl_url_list:
            file_url, VID = get_douyin_url(driver1, sharl_url)
            # An empty URL means the link could not be resolved; skip it.
            if len(file_url) > 0:
                fileName = os.path.join(strDirectory, VID + '.mp4')
                dowfile_v3_web_douyinvod_com(file_url, fileName)
    except Exception as ex:
        print(ex)
    finally:
        # Close the driver on the error path as well as on success.
        if driver1 is not None:
            try:
                driver1.close()
            except Exception as ex:
                print(ex)
if __name__ == "__main__":
    # Detail-page links of the videos to download.
    sharl_url_list = [
        'https://www.douyin.com/video/7293555365818453274',
        'https://www.douyin.com/video/7293552737067928868',
        'https://www.douyin.com/video/7293555206388780324',
    ]
    # Directory the videos are saved to.
    strDirectory = 'D:/douyin/file/202310'
    down_file(sharl_url_list, strDirectory)