案例01--抓取每日影视的m3u8视频

案例:抓取每日影视的m3u8视频

1 思路分析

练习:抓取 每日影视  首页/域名 https://sp.weoknow.com/

无耻之徒视频页:https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/1.html


# 分析
0:获得 视频m3u8的入口
https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/1.html

返回:正则匹配
    "url":"https://v4.dious.cc/share/CjqUrWJmQFUs4Tab"     # 本集的
    "url_next":"https://v4.dious.cc/share/hVE09pB2daIjWASR"   # 下一集的


1: 获得 index.m3u8的 url
https://v4.dious.cc/share/CjqUrWJmQFUs4Tab  

返回: 正则匹配
var playlist = '[{"url":"/20220515/jXanPsgX/1200kb/hls/index.m3u8"}]';   # 其实可以直接到 真正index.m3u8这里
var main = "/20220515/jXanPsgX/index.m3u8";
     


2: 获得 带加密的 index.m3u8的 url :    # 这步可以省略,因为第一步 可直接获得真实index.m3u8
https://v4.dious.cc/20220515/jXanPsgX/index.m3u8

返回: 
#EXTM3U
#EXT-X-STREAM-INF:PROGRAM-ID=1,BANDWIDTH=1200000,RESOLUTION=1280x720  # 指的是m3u8的视频参数:带宽为1200000 bps(约1200kbps),分辨率为1280x720(即720P)
/20220515/jXanPsgX/1200kb/hls/index.m3u8



3:获得 后续的 ts文件的url
https://v4.dious.cc/20220515/jXanPsgX/1200kb/hls/index.m3u8

返回:
#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:6
#EXT-X-PLAYLIST-TYPE:VOD
#EXT-X-MEDIA-SEQUENCE:0
#EXT-X-KEY:METHOD=AES-128,URI="/20220515/jXanPsgX/1200kb/hls/key.key"     # 加密方法为 AES-128,及解密key文件的位置地址
#EXTINF:3.127,
/20220515/jXanPsgX/1200kb/hls/FmqG3vxv.ts
#EXTINF:3.127,
/20220515/jXanPsgX/1200kb/hls/ffzf4KLr.ts

2 完整代码

import os.path
import random
import re
import subprocess
import threading
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import requests

# Request headers passed to every requests.get() call in this script.
# Only a desktop Chrome User-Agent string is set.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}


def get_main_ts_url(url):
    """
    Fetch an episode play page and extract the player entry URLs from it,
    e.g. https://v4.dious.cc/share/GDdSkH33j9Wn8Akx

    :param url: URL of the episode play page
    :return: tuple ``(ts_domain, main_ts_url, next_main_ts_url)`` where
        ``ts_domain`` is ``main_ts_url`` with its last two path components
        removed, ``main_ts_url`` is the entry page of this episode and
        ``next_main_ts_url`` is the entry page of the next episode, or
        ``None`` when the page has no ``"url_next"`` entry
    :raises requests.RequestException: on network failure, timeout or a
        non-success HTTP status
    :raises ValueError: if the page contains no ``"url"`` entry
    """
    # Without a timeout a stalled connection would block the worker forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    html = response.text

    current = re.search(r'"url":"(https:.+?)"', html, re.M)
    if current is None:
        raise ValueError(f'no "url" entry found in play page: {url}')
    # Backslashes are stripped from the captured value (presumably the page
    # embeds JSON-escaped slashes such as `\/` -- confirm against the site).
    main_ts_url = current.group(1).replace('\\', '')
    # https://host/share/<id>  ->  https://host
    ts_domain = main_ts_url.rsplit('/', 2)[0]

    # A page may lack a next-episode link; do not fail the current episode
    # because of a value the caller may not even use.
    following = re.search(r'"url_next":"(https:.+?)"', html, re.M)
    next_main_ts_url = following.group(1).replace('\\', '') if following else None
    return ts_domain, main_ts_url, next_main_ts_url


def get_index_m3u8_url(ts_domain, main_ts_url):
    """
    Fetch an episode's entry page and build the URL of its index.m3u8.

    The first ``"url":"..."`` value found in the page is taken as the
    playlist path and appended to ``ts_domain``.

    :param ts_domain: scheme + host prefix used to build absolute URLs
    :param main_ts_url: URL of the episode's entry page
    :return: absolute URL of the episode's index.m3u8
    :raises requests.RequestException: on network failure, timeout or a
        non-success HTTP status
    :raises ValueError: if the page contains no ``"url"`` entry
    """
    response = requests.get(main_ts_url, headers=headers, timeout=30)
    response.raise_for_status()

    found = re.search(r'"url":"(.+?)"', response.text, re.M)
    if found is None:
        raise ValueError(f'no "url" entry found in entry page: {main_ts_url}')
    # Backslashes are stripped from the captured path before it is joined.
    playlist_path = found.group(1).replace('\\', '')
    return ts_domain + playlist_path


def get_ts_url(index_m3u8_path, ts_domain):
    """
    Parse a downloaded index.m3u8 and collect the segment and key URLs.

    Only lines beginning with ``/`` are treated as segments; each is
    prefixed with ``ts_domain``. If several ``#EXT-X-KEY`` lines carry a
    URI, the last one wins.

    :param index_m3u8_path: local path of the index.m3u8 file
    :param ts_domain: scheme + host prefix used to build absolute URLs
    :return: tuple ``(ts_url_list, key_url)``; ``key_url`` is ``None`` when
        the playlist declares no key URI (unencrypted stream or
        ``METHOD=NONE``)
    """
    ts_url_list = []
    # Must be pre-bound: a playlist without #EXT-X-KEY would otherwise make
    # the return statement raise UnboundLocalError.
    key_url = None
    with open(index_m3u8_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.rstrip()
            if line.startswith('#EXT-X-KEY'):
                # [^"]+ stops at the closing quote of URI, so a following
                # quoted attribute (e.g. KEYFORMAT="...") is not swallowed.
                uri = re.search(r'URI="([^"]+)"', line)
                if uri is not None:
                    key_url = ts_domain + uri.group(1)
            elif line.startswith('/'):
                ts_url_list.append(ts_domain + line)

    return ts_url_list, key_url


def download_file(ts_dir_path, url, filename):
    """
    Download a single file into the episode directory, e.g. the index.m3u8
    playlist or the decryption key.

    :param ts_dir_path: per-episode directory; created if missing
    :param url: URL of the file to download
    :param filename: name to save the file under
    :return: path of the saved file (``ts_dir_path`` + "/" + ``filename``)
    :raises requests.RequestException: on network failure, timeout or a
        non-success HTTP status (nothing is written in that case)
    """
    # exist_ok avoids the check-then-create race when several workers
    # create directories at the same time.
    os.makedirs(ts_dir_path, exist_ok=True)

    response = requests.get(url, headers=headers, timeout=30)
    # Refuse to save an HTTP error page as if it were the requested file.
    response.raise_for_status()

    path = f'{ts_dir_path}/{filename}'
    with open(path, 'wb') as f:
        f.write(response.content)
    return path


def download_video_one(ts_dir_path, ts_url, filename):
    """
    Download one ts segment unless it is already present on disk.

    :param ts_dir_path: per-episode directory the segment is saved in
    :param ts_url: URL of the ts segment
    :param filename: base name of the saved file (the caller passes the
        segment index, giving 0.ts, 1.ts, ...)
    :return: None
    :raises requests.RequestException: on network failure, timeout or a
        non-success HTTP status
    """
    target = f'{ts_dir_path}/{filename}.ts'
    if os.path.exists(target):
        return

    # Random 1-3 second pause before each request.
    time.sleep(random.randint(1, 3))
    response = requests.get(ts_url, headers=headers, timeout=30)
    # An error body saved as N.ts would be skipped by the existence check
    # on every later run, so fail instead of writing it.
    response.raise_for_status()
    print(os.getpid(), threading.current_thread().name, ts_url)

    # Write to a side file and rename, so an interrupted write never leaves
    # a truncated N.ts that looks complete.
    partial = f'{target}.part'
    with open(partial, 'wb') as f:
        f.write(response.content)
    os.replace(partial, target)


def start_thread(ts_dir_path, ts_url_list, max_workers=50):
    """
    Download all ts segments of one episode with a thread pool.

    Each segment is saved under its position in ``ts_url_list``
    (0.ts, 1.ts, ...).

    :param ts_dir_path: per-episode directory the segments are saved in
    :param ts_url_list: list of ts segment URLs, in playlist order
    :param max_workers: size of the thread pool (default 50)
    :return: None
    :raises RuntimeError: if any segment failed to download
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(download_video_one, ts_dir_path, ts_url, index): ts_url
            for index, ts_url in enumerate(ts_url_list)
        }

    # Leaving the with-block waits for all tasks. An exception raised inside
    # a task is stored on its future and is lost unless it is read here.
    failed = []
    for future, ts_url in futures.items():
        error = future.exception()
        if error is not None:
            print(f'下载失败: {ts_url} ({error!r})')
            failed.append(ts_url)

    if failed:
        raise RuntimeError(
            f'{len(failed)} of {len(futures)} ts segments failed to download'
        )


def update_index_m3u8(ts_dir_path, index_m3u8_path):
    """
    Rewrite a downloaded index.m3u8 in place so that the key URI and the
    segment entries point at the local files.

    The key URI becomes ``<ts_dir_path>/key.m3u8`` and the n-th line that
    begins with ``/`` becomes ``<ts_dir_path>/<n>.ts``. Forward slashes are
    used throughout, matching the paths the download functions write to.

    :param ts_dir_path: directory holding the downloaded key and segments
    :param index_m3u8_path: path of the index.m3u8 file to rewrite
    :return: None
    """
    # Normalise separators once; forward slashes are used for both the key
    # and the segments so the playlist does not depend on the OS separator.
    local_dir = ts_dir_path.replace('\\', '/')
    key_attr = f'URI="{local_dir}/key.m3u8"'

    with open(index_m3u8_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    segment_no = 0
    rewritten = []
    for line in lines:
        if line.startswith('#EXT-X-KEY'):
            # Only the URI attribute is replaced. A callable replacement
            # keeps re.sub from interpreting characters in the path.
            line = re.sub(r'URI="[^"]*"', lambda _match: key_attr, line)
        elif line.startswith('/'):
            line = f'{local_dir}/{segment_no}.ts\n'
            segment_no += 1
        rewritten.append(line)

    # Write with the same encoding the file was read with.
    with open(index_m3u8_path, 'w', encoding='utf-8') as f:
        f.writelines(rewritten)


def merge_video(ts_dir_path, video_name):
    """
    Merge the downloaded segments into ``<video_name>.mp4`` with ffmpeg.

    ffmpeg is run with ``ts_dir_path`` as its working directory and reads
    ``index.m3u8`` from there. Nothing is done when the mp4 already exists.

    :param ts_dir_path: directory holding index.m3u8, the key and segments
    :param video_name: base name of the output file (without extension)
    :return: None
    """
    output_name = f'{video_name}.mp4'
    if os.path.exists(os.path.join(ts_dir_path, output_name)):
        print(f'{video_name}.mp4 已经存在了!')
        return

    try:
        # An argument list avoids shell interpretation of video_name, and
        # cwd= keeps the working directory of this process untouched for
        # later tasks run by the same worker.
        completed = subprocess.run(
            ['ffmpeg', '-i', 'index.m3u8', '-c', 'copy', output_name],
            cwd=ts_dir_path,
        )
    except OSError as error:
        # Raised when ffmpeg is not installed or the directory is missing.
        print(f'无法启动 ffmpeg: {error}')
        return

    if completed.returncode != 0:
        print(f'ffmpeg 合并 {output_name} 失败,退出码 {completed.returncode}')


def remove_ts(ts_dir_path):
    """
    Delete the intermediate download files directly inside ``ts_dir_path``:
    numbered segments (``<digits>.ts``) and every ``*.m3u8`` file.

    Sub-directories and other files are left alone. A missing directory is
    ignored.

    :param ts_dir_path: directory to clean up
    :return: None
    """
    if not os.path.isdir(ts_dir_path):
        return

    # Dots are escaped and the whole name must match, so names such as
    # "12.tsx" or "1xts" are not deleted by accident.
    pattern = re.compile(r'\d+\.ts|.*\.m3u8')
    with os.scandir(ts_dir_path) as entries:
        doomed = [
            entry.path
            for entry in entries
            if entry.is_file() and pattern.fullmatch(entry.name)
        ]

    for path in doomed:
        os.remove(path)


def run(ts_dir_path, url, video_name):
    """
    Download one episode end to end: resolve the playlist, fetch the
    playlist, key and segments, rewrite the playlist to local paths and
    merge everything into an mp4.

    :param ts_dir_path: per-episode working directory
    :param url: URL of the episode play page
    :param video_name: base name of the resulting mp4
    :return: None
    """
    print(os.getpid(), f'开始下载第{video_name}集')

    ts_domain, main_ts_url, _ = get_main_ts_url(url)
    index_m3u8_url = get_index_m3u8_url(ts_domain, main_ts_url)
    index_m3u8_path = download_file(ts_dir_path, index_m3u8_url, 'index.m3u8')
    ts_url_list, key_url = get_ts_url(index_m3u8_path, ts_domain)

    if not ts_url_list:
        # Nothing to download or merge; ffmpeg would only fail on it.
        print(f'第{video_name}集没有找到ts文件')
        return

    if key_url:
        # The key is saved as key.m3u8, the name update_index_m3u8 writes
        # into the playlist. An unencrypted playlist has no key to fetch.
        download_file(ts_dir_path, key_url, 'key.m3u8')

    start_thread(ts_dir_path, ts_url_list)
    update_index_m3u8(ts_dir_path, index_m3u8_path)
    merge_video(ts_dir_path, video_name)
    # Cleanup of the intermediate files is available but not run here:
    # remove_ts(ts_dir_path)


def start(root_path='E:\\Shameless\\', episodes=range(2, 7), max_workers=2):
    """
    Download several episodes in parallel, one worker process per episode.

    Episode ``i`` is saved under ``<root_path>/<i>/ts``.

    :param root_path: directory under which per-episode folders are created
    :param episodes: iterable of episode numbers to download
    :param max_workers: number of worker processes
    :return: None
    """
    futures = {}
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        for i in episodes:
            url = f'https://sp.weoknow.com/index.php/vod/play/id/28124/sid/1/nid/{i}.html'
            ts_dir_path = os.path.join(root_path, str(i), 'ts')
            futures[executor.submit(run, ts_dir_path, url, str(i))] = i

    # Exceptions raised in a worker are stored on the future; report them
    # here so a failed episode does not go unnoticed.
    for future, episode in futures.items():
        error = future.exception()
        if error is not None:
            print(f'第{episode}集下载失败: {error!r}')


# Script entry point. The guard keeps start() from running when this module
# is imported rather than executed (e.g. by spawned worker processes).
if __name__ == '__main__':
    start()
posted @ 2024-01-16 00:41  Edmond辉仔  阅读(702)  评论(0编辑  收藏  举报