爬虫8-爬视频

# -*- coding: utf-8 -*-
"""
@Time    :  2022/3/23 16:13
@Author  : Andrew
@File    : 91视频.py
"""
import re

"""
    1.打开91日剧网站http://www.wwmulu.com/rj/wuyantuili/play-1-1.html,拿到网页源代码
    2.从源代码中提取m3u8的url
    3.下载m3u8文件。下载视频
    4.合并视频
"""
# 页面的       http://www.wwmulu.com/rj/wuyantuili/play-1-1.html   http://www.wwmulu.com
# 检查网页源代码/play.html?u=https://new.iskcd.com/20220111/38FCVqzP/1100kb/hls/index.m3u8(这个也可获取到)
#
# /play.html?u=https://new.iskcd.com/20220111/38FCVqzP/index.m3u8
# 抓包      https://new.iskcd.com/20220111/38FCVqzP/index.m3u8
# 它的返回就是/20220111/38FCVqzP/1100kb/hls/index.m3u8 ?_=1648026917721
# 将上述两者拼凑后就是真正的m3u8文件
# https://new.iskcd.com/20220111/38FCVqzP/1100kb/hls/index.m3u8
import requests
from lxml import etree

"""第一部分"""
# HTTP headers shared by every request in this script.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 "
        "Safari/537.36 Core/1.77.97.400 QQBrowser/10.9.4621.400 "
    ),
    # Anti-hotlinking header — lets the server trace which page the request came from.
    # Turned out not to be required for this site, so it stays disabled.
    # "Referer": url
}
# url = "http://www.wwmulu.com/rj/wuyantuili/play-1-1.html"
# resp1 = requests.get(url, headers=headers)
# resp1.encoding = "utf-8"
# html = etree.HTML(resp1.text)
# # 通过m3u8_1以及返回的数据进行拼凑得出最终的m3u8_2
# m3u8_1 = html.xpath('//*[@id="cms_player"]/iframe/@src')[0].split("=")[-1]
# tvName = html.xpath('/html/body/div[2]/div/div[2]/div/div[2]/span[2]/@data-vod_name')[0]+html.xpath('/html/body/div[2]/div/div[2]/div/div[2]/span[2]/@data-playtitle')[0]
# resp2 = requests.get(m3u8_1, headers=headers)
# content = resp2.text.replace("\n", " ").split(" ")[2]
# m3u8_2 = "https://new.iskcd.com/" + content
# # 下载m3u8_2
# resp3 = requests.get(m3u8_2, headers=headers)
# with open("./91日剧/"+tvName+".m3u8", mode="wb") as f:
#     f.write(resp3.content)
# resp1.close()
# resp2.close()
# resp3.close()
"""第二部分"""
import aiohttp
import asyncio
import aiofiles

"""异步协程并发下载(asyncio,并非多线程);不知道是文件太大还是别的原因,最后还是超时了"""
# async def download(url):
#     # AttributeError: __aexit__ 原因是aiohttp.ClientSession没带()
#     async with aiohttp.ClientSession() as session:
#         async with session.get(url, headers=headers) as resp4:  # resp4 = requests.get(url)
#             # print(resp4)
#             name = url.split("hls/")[1].split(".")[0]
#             # print(name)
#             # with open("./91日剧/勿言推理第01集_ts/" + name + ".ts",
#             #           mode="wb") as f:
#             #     f.write(await resp4.content.read())
#             #     print(url, ":下载结束")
#             async with aiofiles.open("./91日剧/勿言推理第01集_ts/" + name + ".ts",
#                                      mode="wb") as f:
#                 await f.write(await resp4.content.read())
#                 print(url, ":下载结束")
#
#
# async def getTasks():
#     tasks = []
#     with open("./91日剧/勿言推理第01集.m3u8", mode="r", encoding="utf-8") as f:
#         for line in f:
#             # 去掉换行符、空白、空格
#             line = line.strip()
#             # 如果#开头,我不要
#             if line.startswith("#"):
#                 continue
#             # 准备异步任务下载视频片段
#             # print(line)
#             tasks.append(download(line))
#     await asyncio.wait(tasks)


# if __name__ == "__main__":
#     asyncio.run(getTasks())  # 开启异步调用

"""单线程"""
# Single-threaded fallback: read the saved m3u8 playlist, download every .ts
# segment in order, and store them as sequentially numbered files so they can
# be concatenated later.
n = 1  # 1-based segment counter, used as the output file name
with open("./91日剧/勿言推理第01集.m3u8", mode="r", encoding="utf-8") as playlist:
    for line in playlist:
        # Strip the trailing newline and surrounding whitespace.
        line = line.strip()
        # m3u8 directives start with '#'; only bare URLs are segments.
        if line.startswith("#"):
            continue
        # Download one segment synchronously. Using the response as a context
        # manager closes the connection each iteration instead of leaking it.
        # NOTE: the segment file handle is named ts_file (not f) so it does not
        # shadow the playlist handle being iterated above.
        with requests.get(line, headers=headers) as resp4:
            with open(f"./91日剧/勿言推理第01集_ts/{n}.ts", mode="wb") as ts_file:
                ts_file.write(resp4.content)
        print(n, ":下载结束")
        n += 1

 

posted @ 2022-03-23 21:48  乔十六  阅读(184)  评论(0)    收藏  举报