python爬虫爬取梨视频——2021.11.21
话不多说,直接上代码。如果爬取代码失效了,可以留言,我看到后会修改。
import requests
from lxml import etree
from multiprocessing.dummy import Pool
import re
# Base request headers shared by every request: a desktop-browser User-Agent.
# This dict is deliberately never rebound below, so any later code that reads
# the module-level `headers` always sees exactly this value.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.6241 SLBChan/8'
}
# Category listing page whose videos are collected.
url = 'https://www.pearvideo.com/category_5'
# `headers` has to be passed by keyword: the second positional parameter of
# requests.get() is `params`, which would send the dict as a query string
# and leave the User-Agent unset.
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
# Relative hrefs of the listed videos; the code below expects "video_<id>".
life_list = tree.xpath("//div[@class='vervideo-bd']/a/@href")
pool_len = len(life_list)
# Collected results: one {'name': <title>, 'url': <mp4 url>} dict per video.
urls = []
for life in life_list:
    # Extract the numeric/content id from the "video_<id>" href.
    id_match = re.match(r'video_(.*)', life)
    if id_match is None:
        print(life, '解析失败,已跳过')
        continue
    video_ip = id_match.group(1)

    # The video's own page carries its title in the poster image's alt text.
    name_url = "https://www.pearvideo.com/" + life
    name_page_text = requests.get(name_url, headers=headers).text
    names = etree.HTML(name_page_text).xpath("//div[@id='poster']/img/@alt")
    if not names:
        print(life, '解析失败,已跳过')
        continue
    name = names[0]

    # The status endpoint is queried with the video's page as Referer.
    # `life` already starts with "video_", so the page URL is used directly
    # instead of prepending "video_" a second time. A separate dict keeps
    # the Referer from leaking into the shared `headers`.
    detail_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + video_ip
    detail_headers = dict(headers, Referer=name_url)
    detail_page_text = requests.get(detail_url, headers=detail_headers).text

    # Non-greedy capture so the match stops at the closing quote of srcUrl
    # even if more quoted fields follow on the same line.
    src_match = re.search(r'"srcUrl":"(.*?)"', detail_page_text)
    if src_match is None:
        print(life, '解析失败,已跳过')
        continue
    errorUrl = src_match.group(1)

    # The returned URL is not used as-is: the part of the file name before
    # the first "-" is replaced with "cont-<id>" to form the download URL.
    reUrl = re.search('https://video.pearvideo.com/mp4/(.*?)/(.*?)/(.*?)-(.*)', errorUrl)
    if reUrl is None:
        print(life, '解析失败,已跳过')
        continue
    srcUrl = (
        'https://video.pearvideo.com/mp4/'
        + reUrl.group(1) + '/'
        + reUrl.group(2) + '/'
        + 'cont-' + video_ip + '-'
        + reUrl.group(4)
    )

    dic = {
        'name': name,
        'url': srcUrl
    }
    urls.append(dic)
def get_video_data(dic):
    """Download one video to ``<name>.mp4`` in the current directory.

    Args:
        dic: mapping with the keys ``'name'`` (video title, used for the
            file name) and ``'url'`` (direct URL of the video file).

    The request uses the module-level ``headers``. Network and HTTP errors
    are reported on stdout instead of being raised, so one failed download
    does not abort the other downloads running in the pool. A download that
    fails midway may leave a partial file behind.
    """
    url = dic['url']
    # Titles may contain characters that are not valid in file names
    # (path separators, '?', ':' ...); replace them so open() cannot fail
    # or write outside the current directory.
    file_name = re.sub(r'[\\/:*?"<>|]', '_', dic['name']) + '.mp4'
    try:
        # stream=True + iter_content writes the video chunk by chunk instead
        # of holding the whole file in memory.
        with requests.get(url=url, headers=headers, stream=True, timeout=30) as resp:
            # Do not save an HTTP error page as if it were a video.
            resp.raise_for_status()
            with open(file_name, 'wb') as fp:
                for chunk in resp.iter_content(chunk_size=1024 * 1024):
                    fp.write(chunk)
    except requests.RequestException as err:
        print(dic['name'], '下载失败', err)
        return
    print(dic['name'], '下载成功')
# Download all collected videos concurrently, one worker thread per video
# (multiprocessing.dummy.Pool is thread-based). Pool(0) raises ValueError,
# so nothing is started when no video was collected.
if urls:
    pool = Pool(len(urls))
    try:
        pool.map(get_video_data, urls)
    finally:
        # Always release the worker threads, even if a download raised.
        pool.close()
        pool.join()

浙公网安备 33010602011771号