路飞学城爬虫梨视频

梨视频最近对视频的下载地址做了伪装(接口返回的是伪地址)。下面是我写的、能够成功爬取的代码,并增加了分页获取。

import random
import re
from multiprocessing import pool

import requests
from lxml import etree
# Scrape Pear Video (pearvideo.com).

# Browser-like request headers shared by the requests in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/88.0.4324.190 Safari/537.36'
    ),
}

# Earlier single-page approach (music hot list), kept for reference:
# url = 'https://www.pearvideo.com/popular_59'
# page_text = requests.get(url=url,headers=headers).text

# Random float between 0 and 1; its precision can be limited with
# round(value, ndigits) wherever it is used.
a = random.uniform(0, 1)

# Id of a video.
contId = ""

# One {"name": ..., "url": ...} entry per video to download.
urls = []
def restore_real_url(fake_url, cont_id):
    """Return the real download URL for a disguised one.

    Per the original author's notes, the status endpoint answers with a URL
    whose file name starts with a numeric timestamp, e.g.
        .../adshort/20210309/1615446624367-15627301_adpkg-ad_hd.mp4
    while the playable file is
        .../adshort/20210309/cont-1722818-15627301_adpkg-ad_hd.mp4
    i.e. the leading number is replaced by "cont-<video id>".

    Only the file name (the part after the last "/") is rewritten, so the
    result does not depend on the directory layout and an equal digit run
    elsewhere in the URL is left alone.

    Args:
        fake_url: URL as returned in videoInfo.videos.srcUrl.
        cont_id: Video id as a string (the part after "video_").

    Returns:
        The rewritten URL, or ``fake_url`` unchanged when its file name does
        not start with "<digits>-" (for example when it is already real).
    """
    directory, slash, file_name = fake_url.rpartition('/')
    stamp, dash, remainder = file_name.partition('-')
    if not dash or not stamp.isdigit():
        return fake_url
    return directory + slash + 'cont-' + cont_id + dash + remainder


def collect_video_urls(page_starts=range(0, 50, 10)):
    """Walk the paginated hot list and resolve each video's download URL.

    Args:
        page_starts: Iterable of ``start`` offsets to request from the
            listing endpoint; the default is 0, 10, 20, 30, 40.

    Returns:
        A list of ``{"name": "<title>.mp4", "url": "<download url>"}`` dicts.
        Entries whose link, title or download address cannot be read are
        skipped instead of aborting the whole run.
    """
    list_url = 'https://www.pearvideo.com/popular_loading.jsp'
    status_url = 'https://www.pearvideo.com/videoStatus.jsp'
    found = []
    # One session for the whole crawl; it is closed when the block exits.
    with requests.Session() as session:
        for page in page_starts:
            params = {
                'reqType': '1',
                'categoryId': '',
                'start': '{}'.format(page),
                'sort': '10',
                'mrd': round(a, 16),  # random number
            }
            page_text = session.get(
                url=list_url, params=params, headers=headers, timeout=10
            ).text
            if not page_text.strip():
                # Nothing came back for this offset; stop paging.
                break
            tree = etree.HTML(page_text)
            if tree is None:
                break
            for li in tree.xpath('/html/body/li'):
                hrefs = li.xpath('./a/@href')
                titles = li.xpath('./div/a/h2/text()')
                if not hrefs or not titles or '_' not in hrefs[0]:
                    continue
                name = titles[0]
                cont_id = hrefs[0].split('_', 1)[1]  # video id
                detail_headers = {
                    # The status endpoint is sent the detail page URL as referer.
                    "Referer": "https://www.pearvideo.com/video_" + cont_id,
                    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.80 Safari/537.36 Edg/86.0.622.43",
                }
                res = session.get(
                    url=status_url,
                    params={'contId': cont_id, 'mrd': random.random()},
                    headers=detail_headers,
                    timeout=10,
                )
                try:
                    fake_url = res.json()['videoInfo']['videos']['srcUrl']
                except (ValueError, KeyError, TypeError):
                    # Not JSON, or no download address in the reply.
                    print(name, "获取下载地址失败,已跳过")
                    continue
                found.append({
                    "name": name + ".mp4",
                    "url": restore_real_url(fake_url, cont_id),
                })
    return found


# Scrape only when run as a script. With the "spawn" start method each pool
# worker re-imports this file, and unguarded top-level scraping would be
# repeated in every worker.
if __name__ == "__main__":
    urls.extend(collect_video_urls())
# Download one video
def down(urls):
    """Download a single video into ./video/.

    Args:
        urls: Dict with "url" (download address) and "name" (target file
            name, including the extension).

    Returns:
        None. Progress is printed; when the server does not answer with
        status 200, or the request fails, nothing further is written and a
        failure line is printed instead of raising, so one bad video does
        not abort the rest of a pool run.
    """
    import os

    url = urls['url']
    # Titles may contain characters that are not allowed in file names
    # (or that would be read as path separators); replace them.
    name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', urls['name'])
    print(name, "正在下载")

    # Storage location; create the directory so open() cannot fail on a
    # fresh checkout. exist_ok keeps this safe when several workers race.
    os.makedirs('./video', exist_ok=True)
    videoPath = './video/' + name

    try:
        # stream=True avoids holding the whole video in memory.
        with requests.get(url=url, headers=headers, stream=True, timeout=30) as resPage:
            print(resPage.status_code)
            if resPage.status_code != 200:
                # Do not save an error page under an .mp4 name.
                print(name, "下载失败")
                return
            with open(videoPath, 'wb') as fp:
                for chunk in resPage.iter_content(chunk_size=64 * 1024):
                    fp.write(chunk)
    except requests.RequestException as err:
        print(name, "下载失败", err)
        return
    print(name, "下载完成")


if __name__ == "__main__":
    # Download with 4 worker processes. map() blocks until every item is
    # done, so leaving the with-block afterwards only releases the workers
    # (the original never closed the pool).
    with pool.Pool(4) as mypool:
        mypool.map(down, urls)

 

posted @ 2021-03-11 15:53  Honglixi  阅读(185)  评论(0)    收藏  举报