爬虫8-爬视频
# -*- coding: utf-8 -*-
"""
@Time   : 2022/3/23 16:13
@Author : Andrew
@File   : 91视频.py

Scrape an episode from the "91" drama site and download its video:
  1. Fetch http://www.wwmulu.com/rj/wuyantuili/play-1-1.html and read the page source.
  2. Extract the m3u8 playlist URL from the source.
  3. Download the m3u8 file, then every .ts segment it lists.
  4. (Separately) merge the segments into a single video file.

How the real playlist URL was found (packet capture):
  * the page embeds an iframe: /play.html?u=https://new.iskcd.com/20220111/38FCVqzP/index.m3u8
  * requesting that index.m3u8 returns the relative path
      /20220111/38FCVqzP/1100kb/hls/index.m3u8   (plus a cache-buster ?_=...)
  * joining it onto the host gives the real playlist:
      https://new.iskcd.com/20220111/38FCVqzP/1100kb/hls/index.m3u8
"""
import asyncio
import os
import re  # noqa: F401  (kept from the original file; unused by the live path)

import aiofiles
import aiohttp
import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 "
                  "Safari/537.36 Core/1.77.97.400 QQBrowser/10.9.4621.400 ",
    # Anti-hotlink header: "Referer" tells the server which page issued the
    # request. Set it if the CDN starts rejecting the downloads.
    # "Referer": url
}


def fetch_playlist(page_url="http://www.wwmulu.com/rj/wuyantuili/play-1-1.html"):
    """Part 1: resolve the episode's real m3u8 URL and save the playlist.

    Returns the local path of the saved .m3u8 file.
    Responses are closed even on error (the original leaked resp1-resp3).
    """
    resp1 = requests.get(page_url, headers=headers)
    try:
        resp1.encoding = "utf-8"
        html = etree.HTML(resp1.text)
    finally:
        resp1.close()
    # iframe src looks like /play.html?u=<first-level m3u8 url>; keep the url part.
    m3u8_1 = html.xpath('//*[@id="cms_player"]/iframe/@src')[0].split("=")[-1]
    tv_name = (
        html.xpath('/html/body/div[2]/div/div[2]/div/div[2]/span[2]/@data-vod_name')[0]
        + html.xpath('/html/body/div[2]/div/div[2]/div/div[2]/span[2]/@data-playtitle')[0]
    )
    # The first-level playlist body contains the relative path of the real one;
    # token index 2 of the whitespace-split body is that path (site-specific).
    resp2 = requests.get(m3u8_1, headers=headers)
    try:
        content = resp2.text.replace("\n", " ").split(" ")[2]
    finally:
        resp2.close()
    m3u8_2 = "https://new.iskcd.com/" + content
    # Download the real playlist next to the episode name.
    resp3 = requests.get(m3u8_2, headers=headers)
    try:
        out_path = "./91日剧/" + tv_name + ".m3u8"
        with open(out_path, mode="wb") as fp:
            fp.write(resp3.content)
    finally:
        resp3.close()
    return out_path


def _segment_urls(m3u8_path):
    """Yield the .ts segment URLs listed in a local m3u8 playlist file."""
    with open(m3u8_path, mode="r", encoding="utf-8") as fp:
        for raw in fp:
            line = raw.strip()           # drop newline and surrounding blanks
            if not line or line.startswith("#"):
                continue                 # skip m3u8 tags/comments, keep only URLs
            yield line


async def _download_segment(session, url, ts_dir):
    """Part 2 (async): download one segment via aiohttp and write it with aiofiles.

    The original commented draft mixed blocking requests.get() into the async
    path and reused the name resp4; this version is fully non-blocking.
    """
    # Segment name is the path piece between "hls/" and the extension.
    name = url.split("hls/")[1].split(".")[0]
    async with session.get(url, headers=headers) as resp:
        data = await resp.content.read()
    async with aiofiles.open(os.path.join(ts_dir, name + ".ts"), mode="wb") as fp:
        await fp.write(data)
    print(url, ":下载结束")


async def download_async(m3u8_path, ts_dir):
    """Download every segment concurrently (may still time out on slow CDNs)."""
    os.makedirs(ts_dir, exist_ok=True)
    async with aiohttp.ClientSession() as session:
        tasks = [asyncio.create_task(_download_segment(session, url, ts_dir))
                 for url in _segment_urls(m3u8_path)]
        await asyncio.gather(*tasks)


def download_sync(m3u8_path="./91日剧/勿言推理第01集.m3u8",
                  ts_dir="./91日剧/勿言推理第01集_ts"):
    """Single-threaded fallback: fetch the .ts segments one by one, numbered 1..N."""
    os.makedirs(ts_dir, exist_ok=True)   # original crashed if the dir was missing
    for n, url in enumerate(_segment_urls(m3u8_path), start=1):
        resp = requests.get(url, headers=headers)
        try:
            # Distinct handle name: the original's inner `with open(...) as f`
            # shadowed the playlist handle that was also bound to `f`.
            with open(os.path.join(ts_dir, f"{n}.ts"), mode="wb") as fp:
                fp.write(resp.content)
        finally:
            resp.close()                 # original leaked every response object
        print(n, ":下载结束")


if __name__ == "__main__":
    # Only the single-threaded path runs by default, matching the original
    # script's live behavior; fetch_playlist()/download_async() are kept as
    # callable versions of the previously commented-out parts.
    download_sync()