路飞学城爬虫梨视频
梨视频最近对视频的下载地址做了伪装（接口返回的是伪地址）。下面是我写的、已成功爬取的代码：先把伪地址还原为真实地址，并增加了分页获取。
"""Download the videos listed on Pear Video's popular ranking.

The script pages through the ranking list, asks the ``videoStatus.jsp``
endpoint for each video's (disguised) download link, rewrites that link into
the real one, and finally downloads every video with a small process pool.
"""
import os
import random
import re
from multiprocessing import pool

import requests
from lxml import etree

# Browser-like header sent with the list-page and download requests.
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
}

# Every {"name": ..., "url": ...} record to download; filled in under the
# ``__main__`` guard so that importing this module performs no network I/O.
urls = []

# Directory the downloaded videos are written to.
VIDEO_DIR = './video/'

# Seconds to wait for a server before giving up on a request.
REQUEST_TIMEOUT = 30

# Captures the segment of the disguised link that has to be swapped for
# "cont-<video id>".
#   real link:      https://video.pearvideo.com/mp4/adshort/20210309/cont-1722818-15627301_adpkg-ad_hd.mp4
#   disguised link: https://video.pearvideo.com/mp4/adshort/20210309/1615446624367-15627301_adpkg-ad_hd.mp4
_FAKE_SEGMENT = re.compile(r"adshort/.*?/(.*?)-.*?")

# Characters that are path separators or are rejected in Windows file names.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')


def _safe_filename(name):
    """Return *name* with characters that are unsafe in file names replaced by '_'."""
    return _UNSAFE_FILENAME_CHARS.sub('_', name).strip()


def _real_download_url(fake_url, cont_id):
    """Turn the disguised link into the real one, or return None if it does not match.

    Only the captured segment is replaced (not every occurrence of its text),
    so an unrelated repetition of the same characters elsewhere in the link is
    left alone.
    """
    match = _FAKE_SEGMENT.search(fake_url)
    if match is None:
        return None
    return fake_url[:match.start(1)] + "cont-" + cont_id + fake_url[match.end(1):]


def _fetch_fake_url(session, cont_id):
    """Ask videoStatus.jsp for the disguised download link of one video.

    Returns the link, or None (after printing the reason) when the request
    fails or the response does not have the expected shape.
    """
    status_url = "https://www.pearvideo.com/videoStatus.jsp?contId={}&mrd=0.9193969693754276".format(cont_id)
    status_headers = {
        # The endpoint is called with the video's detail page as the referer.
        "Referer": "https://www.pearvideo.com/video_" + cont_id,
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43",
    }
    try:
        res = session.get(url=status_url, headers=status_headers, timeout=REQUEST_TIMEOUT).json()
        return res['videoInfo']['videos']['srcUrl']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # One unavailable video should not abort the whole crawl.
        print(cont_id, "获取下载地址失败", repr(err))
        return None


def _collect_videos():
    """Walk the ranking pages and return a list of {"name", "url"} records."""
    videos = []
    list_url = 'https://www.pearvideo.com/popular_loading.jsp'
    # Random float sent as the "mrd" query parameter of the list request.
    mrd = round(random.uniform(0, 1), 16)

    # A single session (closed by the with-block) serves every status lookup.
    with requests.Session() as session:
        for page in range(0, 50, 10):
            params = {
                'reqType': '1',
                'categoryId': '',
                'start': '{}'.format(page),
                'sort': '10',
                'mrd': mrd,
            }
            page_text = requests.get(
                url=list_url, params=params, headers=headers, timeout=REQUEST_TIMEOUT
            ).text
            if not page_text.strip():
                # Nothing to parse on this page.
                continue
            tree = etree.HTML(page_text)
            if tree is None:
                continue

            for li in tree.xpath('/html/body/li'):
                hrefs = li.xpath('./a/@href')
                titles = li.xpath('./div/a/h2/text()')
                if not hrefs or not titles:
                    # Entry without a link or a title: not something we can download.
                    continue
                name = titles[0]
                # The video id is whatever follows the first '_' in the href.
                _, sep, cont_id = hrefs[0].partition('_')
                if not sep:
                    continue

                fake_url = _fetch_fake_url(session, cont_id)
                if fake_url is None:
                    continue
                down_url = _real_download_url(fake_url, cont_id)
                if down_url is None:
                    print(name, "无法解析下载地址", fake_url)
                    continue

                videos.append({
                    "name": name + ".mp4",
                    "url": down_url,
                })
    return videos


# Download one video.
def down(urls):
    """Download a single video record and save it under ``./video/``.

    Args:
        urls: a dict with the keys "url" (download link) and "name"
            (file name including the ".mp4" suffix).

    A failed request or a non-success status is reported and the video is
    skipped, so one bad link does not stop the rest of the pool's work.
    """
    url = urls['url']
    name = urls['name']
    print(name, "正在下载")
    try:
        resPage = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(name, "下载失败", repr(err))
        return
    print(resPage.status_code)
    if not resPage.ok:
        # Do not save an error page as if it were a video.
        print(name, "下载失败")
        return
    # Create the target directory on demand; exist_ok keeps this safe when
    # several workers reach this line at the same time.
    os.makedirs(VIDEO_DIR, exist_ok=True)
    videoPath = VIDEO_DIR + _safe_filename(name)
    with open(videoPath, 'wb') as fp:
        fp.write(resPage.content)
    print(name, "下载完成")


if __name__ == "__main__":
    # Crawl only in the parent process: with the "spawn" start method each
    # worker re-imports this module, and must not repeat the crawl.
    urls.extend(_collect_videos())
    with pool.Pool(4) as mypool:
        mypool.map(down, urls)

浙公网安备 33010602011771号