• 博客园logo
  • 会员
  • 众包
  • 新闻
  • 博问
  • 闪存
  • 赞助商
  • HarmonyOS
  • Chat2DB
    • 搜索
      所有博客
    • 搜索
      当前博客
  • 写随笔 我的博客 短消息 简洁模式
    用户头像
    我的博客 我的园子 账号设置 会员中心 简洁模式 ... 退出登录
    注册 登录
山城小跳
博客园    首页    新随笔    联系   管理    订阅  订阅

python 如何抓取m3u8片段的电影

  • 第一步:查看url路径是否在页面源代码中
from urllib.parse import urljoin

import requests
import os
from concurrent.futures import ThreadPoolExecutor, wait
import re

# Request headers sent with every HTTP call in this script; the User-Agent
# value is a desktop Chrome browser string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
}


def down_video(url, i, max_retries=10, timeout=30):
    """Download one .ts segment and save it as ``<i>.ts``.

    The target directory and the request headers are read from the
    module-level names ``path`` and ``headers``.

    Args:
        url: Absolute URL of the segment.
        i: Sequence number of the segment; used as the output file name.
        max_retries: Maximum number of download attempts before giving up.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block a worker thread forever.

    Returns:
        True if the segment was written, False if every attempt failed.
    """
    print(f"{i}.ts开始下载")
    target = os.path.join(path, str(i) + '.ts')
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            # Without this an HTTP error page would be saved as video data.
            resp.raise_for_status()
            with open(target, mode="wb") as f3:
                f3.write(resp.content)
        except (requests.RequestException, OSError) as e:
            # Only network and file errors are worth retrying; programming
            # errors (e.g. a missing global) must surface instead of looping.
            print(f"下载失败,重新下载 ({i}.ts {attempt}/{max_retries}): {e}")
        else:
            print(f"{i}.ts下载完成")
            return True
    print(f"{i}.ts下载失败,已重试{max_retries}次")
    return False

def download_all_videos(path, host, m3u8_filename="index.m3u8"):
    """Download every segment listed in a local m3u8 playlist concurrently.

    Segments are numbered 0, 1, 2, ... in playlist order, counting only
    non-blank lines that are not ``#`` tags. Each one is handed to
    ``down_video`` on a thread pool.

    Args:
        path: Directory for the downloaded segments; created if missing.
            NOTE(review): ``down_video`` is defined elsewhere in this file
            and appears to read its target directory from a module-level
            ``path`` rather than from this argument - confirm they agree.
        host: Prefix prepended to every playlist entry that is not already
            an absolute http(s) URL.
        m3u8_filename: Local playlist file to read.
    """
    os.makedirs(path, exist_ok=True)
    tasks = []
    # The executor context manager guarantees the worker threads are shut
    # down even if reading the playlist fails.
    with open(m3u8_filename, mode="r", encoding="utf-8") as f, \
            ThreadPoolExecutor(max_workers=50) as pool:
        i = 0
        for line in f:
            line = line.strip()
            # Skip tags and blank lines; a blank line would otherwise turn
            # into a request for the bare host.
            if not line or line.startswith("#"):
                continue
            if line.startswith(("http://", "https://")):
                segment_url = line
            else:
                segment_url = host + line
            print(segment_url, i)
            tasks.append(pool.submit(down_video, segment_url, i))
            i += 1
        # Wait for all downloads to finish.
        wait(tasks)


# 处理m3u8文件中的url问题
def do_m3u8_url(url, path, m3u8_filename="index.m3u8"):
    """Write a localized copy of a playlist that points at downloaded files.

    Reads ``m3u8_filename`` from the current directory and writes a playlist
    of the same name into ``path``. Every segment line is replaced by
    ``<n>.ts`` (n = 0, 1, 2, ... counting only non-blank, non-tag lines).
    An ``#EXT-X-KEY`` tag with a quoted ``URI`` has that URI rewritten to the
    local file ``key.m3u8``; other attributes such as ``IV`` are preserved.
    The key itself is fetched through ``download_m3u8`` and stored as
    ``<path>/key.m3u8``.

    Args:
        url: URL the playlist was downloaded from; the key URI is resolved
            relative to it.
        path: Output directory; created if missing.
        m3u8_filename: Name of the input playlist and of the output copy.
    """
    os.makedirs(path, exist_ok=True)

    with open(m3u8_filename, mode="r", encoding="utf-8") as f:
        data = f.readlines()

    key_uri_pattern = re.compile(r'URI="([^"]*)"')
    i = 0
    # The context manager ensures the rewritten playlist is flushed and closed.
    with open(os.path.join(path, m3u8_filename), 'w', encoding='UTF-8') as fw:
        for line in data:
            stripped = line.strip()
            # Blank lines carry no information and must not be numbered.
            if not stripped:
                continue
            if not stripped.startswith("#"):
                fw.write(f'{i}.ts\n')
                i += 1
                continue
            # Only the key tag is rewritten; other tags may also carry a URI
            # attribute and are copied through untouched.
            match = None
            if stripped.startswith("#EXT-X-KEY"):
                match = key_uri_pattern.search(stripped)
            if match is not None:
                # e.g. URI="/20220622/5LnZiDXn/1500kb/hls/key.key"
                key_url = urljoin(url, match.group(1))
                print("key url", key_url)
                download_m3u8(key_url, os.path.join(path, 'key.m3u8'))
                stripped = key_uri_pattern.sub('URI="key.m3u8"', stripped, count=1)
                print("line", stripped)
            fw.write(stripped + "\n")


def download_m3u8(url, m3u8_filename="index.m3u8", state=0, timeout=30):
    """Download ``url`` and store the response body in ``m3u8_filename``.

    The body is written byte-for-byte. Round-tripping it through a decoded
    string would corrupt non-text payloads such as a binary key file.

    Args:
        url: Resource to fetch.
        m3u8_filename: Destination file path.
        state: Unused; kept so existing callers keep working.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
    """
    print(f'正在下载{m3u8_filename}文件')
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    with open(m3u8_filename, mode="wb") as f:
        f.write(resp.content)


def merge(path, filename='output'):
    """Merge the segments referenced by ``<path>/index.m3u8`` into an mp4.

    Runs ``ffmpeg -i index.m3u8 -c copy <filename>.mp4`` with ``path`` as
    the working directory of the child process only, so the caller's
    current directory is left untouched. Arguments are passed as a list,
    which keeps file names with spaces or shell metacharacters intact.

    Args:
        path: Directory containing index.m3u8 and the segments; the output
            file is created there as well.
        filename: Output file name without the ``.mp4`` extension.
    """
    # Local imports: these modules are not part of the file-level imports.
    import shutil
    import subprocess

    ffmpeg = shutil.which('ffmpeg')
    if ffmpeg is None:
        print('ffmpeg not found on PATH, skip merging')
        return
    subprocess.run(
        [ffmpeg, '-i', 'index.m3u8', '-c', 'copy', f'{filename}.mp4'],
        cwd=path,
        check=False,
    )


def get_m3u8data(first_m3u8url, timeout=30):
    """Fetch the final playlist and save it as ``index.m3u8``.

    The first playlist is downloaded. If it already lists segments (it
    contains ``#EXTINF`` tags) it is saved as is. Otherwise its first URI
    line is resolved against ``first_m3u8url`` and that second playlist is
    downloaded and saved.

    Args:
        first_m3u8url: URL of the first-level m3u8.
        timeout: Per-request timeout in seconds.

    Returns:
        The URL of the playlist that was saved to ``index.m3u8``.

    Raises:
        ValueError: If the first playlist contains no URI line.
        requests.RequestException: On network failure or a non-2xx status.
    """
    with requests.Session() as session:
        resp = session.get(first_m3u8url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        resp.encoding = 'UTF-8'
        data = resp.text
        final_url = first_m3u8url
        if '#EXTINF' not in data:
            # Take the first non-tag line as the nested playlist; urljoin
            # copes with absolute paths, relative paths and full URLs alike.
            variant = next(
                (ln.strip() for ln in data.splitlines()
                 if ln.strip() and not ln.strip().startswith('#')),
                None,
            )
            if variant is None:
                raise ValueError(f'no playlist URI found in {first_m3u8url}')
            final_url = urljoin(first_m3u8url, variant)
            resp = session.get(final_url, headers=headers, timeout=timeout)
            resp.raise_for_status()
    with open('index.m3u8', 'wb') as f:
        f.write(resp.content)
    return final_url


if __name__ == '__main__':
    # Directory where the .ts segments are stored.
    path = 'ts'
    # URL of the index.m3u8 for the encrypted .ts stream (first-level m3u8).
    url = 'https://s7.fsvod1.com/20220622/5LnZiDXn/index.m3u8'
    m3u8_url = get_m3u8data(url)
    print(m3u8_url)
    # Remaining pipeline steps, left disabled:
    # # download the .ts segments listed in index.m3u8
    # host = 'https://s7.fsvod1.com'  # host prefix used to build full segment and key URLs
    # download_all_videos(path, host)
    # do_m3u8_url(m3u8_url, path)
    # # merge the segments
    # merge(path, '奇异博士')
    print('over')
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
转换格式(第二个示例:下载伪装成 png 的 ts 片段,修复文件头后再合并)

import requests
import re
from urllib.parse import urljoin
import os
from concurrent.futures import ThreadPoolExecutor, wait

# Directory that receives the raw downloaded segments; it is created when
# this module is executed if it does not exist yet.
file_path = "ts_before"
if not os.path.exists(file_path):
    os.mkdir(file_path)

# Request headers sent with every HTTP call in this script; the user-agent
# value is a desktop Chrome browser string.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36"
}


# 获取网页中的src路径
def get_frame_src(url, timeout=30):
    """Extract the index.m3u8 URL embedded in a page's source.

    The page is searched for a ``"url":"...index.m3u8"`` fragment and the
    captured value is returned with all backslashes removed.

    Args:
        url: Address of the page to fetch.
        timeout: Per-request timeout in seconds.

    Returns:
        The m3u8 URL found in the page.

    Raises:
        ValueError: If the page contains no matching fragment.
        requests.RequestException: On network failure or a non-2xx status.
    """
    with requests.Session() as session:
        resp = session.get(url=url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    match = re.search(r'"url":"(?P<url>.*?index\.m3u8)"', resp.text)
    if match is None:
        # Fail with a message that names the page instead of an opaque
        # AttributeError on None.
        raise ValueError(f"no index.m3u8 url found in page source of {url}")
    return match.group("url").replace("\\", "")

# 获取到url请求下载first.m3u8文件
def get_firstm3u8(src_url, timeout=30):
    """Download the first playlist and the playlist(s) it refers to.

    The response for ``src_url`` is saved as ``first.m3u8``. Each non-blank,
    non-tag line in it is resolved against ``src_url``, downloaded and
    written to ``second.m3u8``. If several such lines exist, the file ends
    up holding the last one.

    Args:
        src_url: URL of the first-level m3u8.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
    """
    with requests.session() as session:
        resp = session.get(url=src_url, headers=headers, timeout=timeout)
        resp.raise_for_status()
        first_text = resp.text
        with open("first.m3u8", "w", encoding="utf-8") as f1:
            f1.write(first_text)
        print("first.m3u8 文件下载完成")
        for line in first_text.splitlines():
            line = line.strip()
            # A blank line would resolve to src_url itself and overwrite
            # second.m3u8 with the first playlist.
            if not line or line.startswith("#"):
                continue
            print(line)
            # Entries may be relative, so resolve them against the source URL.
            src = urljoin(src_url, line)
            response = session.get(url=src, headers=headers, timeout=timeout)
            response.raise_for_status()
            print(response.text)
            with open("second.m3u8", "w", encoding="utf-8", newline="") as f3:
                f3.write(response.text)
            print("second.m3u8 文件下载完成")

# 下载一个png片段
def download_one_video(url, i, timeout=30):
    """Download one segment and store it as ``<i>.ts`` under ``file_path``.

    Args:
        url: Absolute URL of the segment.
        i: Sequence number of the segment; used as the output file name.
        timeout: Per-request timeout in seconds.

    Raises:
        requests.RequestException: On network failure or a non-2xx status.
        OSError: If the file cannot be written.
    """
    print(f"{i}.ts开始下载")
    try:
        # requests.get closes its connection resources after the call,
        # unlike a per-segment Session that is never closed.
        resp = requests.get(url=url, headers=headers, timeout=timeout)
        # Without this an HTTP error page would be saved as video data.
        resp.raise_for_status()
        with open(os.path.join(file_path, f"{i}.ts"), "wb") as f:
            f.write(resp.content)
    except (requests.RequestException, OSError) as e:
        # When run on a thread pool the exception is only stored on the
        # future, so report it here before propagating.
        print(f"{i}.ts下载失败: {e}")
        raise
    print(f"{i}.ts下载完成")

def download_all_video(m3u8_filename="second.m3u8", max_workers=50):
    """Download every segment listed in a local playlist concurrently.

    Each non-blank, non-tag line of the playlist is treated as a segment
    URL and handed to ``download_one_video`` together with its running
    index (0, 1, 2, ...).

    Args:
        m3u8_filename: Local playlist file to read.
        max_workers: Size of the download thread pool.
    """
    tasks = []
    # The executor context manager shuts the worker threads down on exit.
    with open(m3u8_filename, "r", encoding="utf-8") as f, \
            ThreadPoolExecutor(max_workers) as pool:
        i = 0
        for line in f:
            line = line.strip()
            # Blank lines are not URLs and must not consume an index.
            if not line or line.startswith("#"):
                continue
            tasks.append(pool.submit(download_one_video, line, i))
            i += 1
        wait(tasks)

# 解析伪装成png的ts
def resolve_ts(src_path, dst_path):
    """Copy numbered segments to ``dst_path`` with their first 4 bytes set to 0xFF.

    Files in ``src_path`` are processed in numeric order of the part of
    their name before the first dot. Each output file has the same name and
    the same content as its source, except that bytes 0-3 are replaced by
    ``FF FF FF FF``. Presumably this removes a fake PNG signature placed in
    front of the real stream data - confirm against the actual source files.

    Files whose name does not start with an integer are skipped. A file
    that cannot be read or written is reported and skipped.

    Args:
        src_path: Directory with the downloaded segments.
        dst_path: Output directory; created if missing.
    """
    os.makedirs(dst_path, exist_ok=True)

    indexed = []
    for name in os.listdir(src_path):
        try:
            indexed.append((int(name.split('.')[0]), name))
        except ValueError:
            # Not a numbered segment (e.g. a stray hidden file); ignore it.
            continue
    indexed.sort(key=lambda pair: pair[0])

    for _, name in indexed:
        origin_ts = os.path.join(src_path, name)
        resolved_ts = os.path.join(dst_path, name)
        try:
            with open(origin_ts, "rb") as infile:
                data = infile.read()
            with open(resolved_ts, "wb") as outfile:
                # Same result as writing the data and then overwriting the
                # first four bytes in place.
                outfile.write(b'\xff\xff\xff\xff' + data[4:])
        except OSError as e:
            print('resolve ' + origin_ts + ' failed: ' + str(e))
            continue
        print('resolve ' + origin_ts + ' success')

# 合并
def merge(filePath, filename='output'):
    """Concatenate the numbered segments in ``filePath`` into ``<filename>.mp4``.

    Writes ``file_list.txt`` in the current directory with one
    ``file '<filePath>/<name>'`` line per segment, in numeric order of the
    part of the name before the first dot. Then runs ffmpeg's concat
    demuxer on that list with stream copy.

    Files whose name does not start with an integer are left out of the
    list. If ffmpeg is not on PATH the list is still written and the merge
    step is skipped.

    Args:
        filePath: Directory containing the segments.
        filename: Output file name without the ``.mp4`` extension.
    """
    # Local imports: these modules are not part of the file-level imports.
    import shutil
    import subprocess

    indexed = []
    for name in os.listdir(filePath):
        try:
            indexed.append((int(name.split('.')[0]), name))
        except ValueError:
            continue
    indexed.sort(key=lambda pair: pair[0])
    file_list = [name for _, name in indexed]
    print(file_list)

    # ffmpeg reads the list as UTF-8; the platform default encoding would
    # garble non-ASCII directory names.
    with open("./file_list.txt", "w", encoding="utf-8") as f:
        for file in file_list:
            # Inside single quotes a literal quote is written as '\''.
            entry = "{}/{}".format(filePath, file).replace("'", "'\\''")
            f.write("file '{}'\n".format(entry))

    ffmpeg = shutil.which('ffmpeg')
    if ffmpeg is None:
        print('ffmpeg not found on PATH, skip merging')
        return
    # An argument list keeps output names with spaces or shell
    # metacharacters intact.
    subprocess.run(
        [ffmpeg, '-f', 'concat', '-safe', '0', '-i', 'file_list.txt',
         '-c', 'copy', f'{filename}.mp4'],
        check=False,
    )

if __name__ == '__main__':
    # Page that embeds the player; only needed by the disabled download stage.
    url = "https://www.9meiju.cc/mohuankehuan/shandianxiadibaji/1-6.html"
    # Download stage (disabled):
    # src_url = get_frame_src(url)
    # get_firstm3u8(src_url)
    # download_all_video()

    # Rewrite the segment headers into a second directory, then merge them.
    resolved_dir = "ts_after"
    resolve_ts(file_path, resolved_dir)
    merge(resolved_dir, "闪电侠6集")
posted @ 2022-08-30 19:14  字母一哥  阅读(1428)  评论(0)    收藏  举报
刷新页面返回顶部
博客园  ©  2004-2025
浙公网安备 33010602011771号 浙ICP备2021040463号-3