# 异步爬取网站 (asynchronous website scraping: downloads videos with a thread pool)

import requests
import re
import json
from lxml.etree import HTML
from multiprocessing.dummy import Pool
# Category listing page whose video entries are scraped.
url = 'https://www.pearvideo.com/category_5'
# Endpoint queried once per video for its JSON metadata (contains the source URL).
status_url = 'https://www.pearvideo.com/videoStatus.jsp'
user_agent = "Mozilla/5.0(WindowsNT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/88.0.4324.190Safari/537.36"

# One shared session serves the listing request, the metadata requests and
# the downloads performed later by get_cont().
session = requests.session()
headers = {
    'user-agent': user_agent,
    # NOTE(review): hard-coded cookie copied from a browser session; it is
    # presumably stale and should not be kept in source - confirm and remove.
    'cookie': '__secdyid=2679d2eb613ea37f4a9e57af99f27b26ca64220e3ad49054021615622670; PEAR_UUID=33f85c2b-9aab-4d31-af7a-8a7cb3e702d0; _uab_collina=161562267239630461871081; UM_distinctid=1782a9d936577a-039c7f320768c5-353c540f-1fa400-1782a9d9366721; p_h5_u=967CDCB3-85E2-48CC-A2B8-D8E8716C7B97; Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1615622673,1615775687; JSESSIONID=F5B074E4EABA1D3E714D8F53554CF36B; acw_tc=76b20f4416157786907688435e7758a2efe460ece2a55ce76f5069ee13cab1; CNZZDATA1260553744=272537789-1615618504-%7C1615777233; Hm_lpvt_9707bc8d5f6bba210e7218b8496f076a=1615779974; SERVERID=a6169b2e0636a71b774d6641c064eb8c|1615780345|1615775689',
}

listing = session.get(url=url, headers=headers)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty video list.
listing.raise_for_status()
video_items = HTML(listing.text).xpath('//*[@id="listvideoListUl"]/li')

# Download jobs consumed by get_cont(): one {'name': ..., 'url': ...} per video.
data = []
for item in video_items:
    hrefs = item.xpath('./div/a/@href')
    titles = item.xpath('./div/a/div[2]/text()')
    if not hrefs or not titles:
        # List entry without a link or a title: nothing to download.
        continue

    # Everything after the first "_" of the page URL is used as the content
    # id (the href is expected to look like "video_1723220").
    video_page = 'https://www.pearvideo.com/' + hrefs[0]
    cont_id = video_page.split('_', 1)[-1]

    # NOTE(review): this rebinds the module-level ``headers``; get_cont()
    # reads that name later, so downloads are sent with the headers built
    # for the last video processed here.
    headers = {
        'user-agent': user_agent,
        'Referer': 'https://www.pearvideo.com/video_' + cont_id,
    }
    # The query string is passed only through ``params``; it used to be
    # embedded in the URL as well, which sent every parameter twice.
    params = {
        'contId': cont_id,
        'mrd': '0.17465945052790843',
    }
    resp = session.get(url=status_url, params=params, headers=headers).json()

    try:
        src_url = resp['videoInfo']['videos']['srcUrl']
        system_time = resp['systemTime']
    except KeyError:
        # The response carries no video information for this id; skip it
        # rather than aborting the whole run.
        print('skipped (no video info): ' + cont_id)
        continue

    # The returned srcUrl contains the ``systemTime`` value where the real
    # file URL has "cont-<id>", e.g.
    #   .../1615777021671-10008579-094913-hd.mp4
    #   -> .../cont-1723084-10008579-094913-hd.mp4
    data.append({
        'name': titles[0] + '.mp4',
        'url': src_url.replace(system_time, 'cont-' + cont_id),
    })
def get_cont(dic):
    """Download one video and save it in the current working directory.

    Uses the module-level ``session`` and ``headers`` for the request.

    Args:
        dic: Mapping with the keys ``'url'`` (address of the video file) and
            ``'name'`` (file name to store it under).

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            that an error page is never written out as a video file.
    """
    # Replace characters that are not allowed in file names on common
    # platforms. This supersedes the earlier one-off replacement of a single
    # title that contained double quotes.
    file_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', dic['name'].strip())

    # Stream the body so a whole video is never held in memory at once
    # (several downloads run concurrently in the thread pool).
    with session.get(url=dic['url'], headers=headers, stream=True) as resp:
        resp.raise_for_status()
        print(file_name)
        with open(file_name, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
# Download all collected videos with 4 workers. ``Pool`` comes from
# multiprocessing.dummy, i.e. the workers are threads, not processes.
pool = Pool(4)
try:
    # Blocks until every job in ``data`` has been processed by get_cont().
    pool.map(get_cont, data)
finally:
    # Always shut the pool down, even when a download raised, so that no
    # worker threads are left behind.
    pool.close()
    pool.join()


# with open ('nihao.html','w',encoding='utf-8')as f:
# f.write(resp)
# print(resp)
# ex ='src="(.*?)" style='
# divo_page_text=re.findall(ex,resp)
# print(divo_page_text)
# print(resp)
# tt=resp.xpath('//*[@id="JprismPlayer"]/video/@src')
# print(tt)


'''
<video webkit-playsinline="" playsinline="" x-webkit-airplay="" autoplay="autoplay"

src="https://video.pearvideo.com/mp4/third/20210312/cont-1723220-15690592-205220-hd.mp4"
style="width: 100%; height: 100%;"></video>

ex = '^src="(.*?)".*?</video>$ '

'''

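# Real file URL (the numeric segment replaced by "cont-<contId>"):
# https://video.pearvideo.com/mp4/third/20210312/cont-1723084-10008579-094913-hd.mp4
# URL in the form the script expects as srcUrl from videoStatus.jsp, before the replacement:
# https://video.pearvideo.com/mp4/third/20210312/1615777021671-10008579-094913-hd.mp4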

'''

https://www.pearvideo.com/videoStatus.jsp?contId=1723084&mrd=0.7401076643041862 0.6216678891153828 0.3593407426489732
https://www.pearvideo.com/videoStatus.jsp?contId=1723059&mrd=0.9479653804657795
https://www.pearvideo.com/videoStatus.jsp?contId=1723059&mrd=0.3543527845268857


'''

# url='https://video.pearvideo.com/mp4/third/20210312/1615629788048-10008579-094913-hd.mp4'
# he={
# 'Cookie':'__secdyid=2679d2eb613ea37f4a9e57af99f27b26ca64220e3ad49054021615622670; JSESSIONID=28C4914FC9B62142058DDEA02E2CF6B8; PEAR_UUID=33f85c2b-9aab-4d31-af7a-8a7cb3e702d0; _uab_collina=161562267239630461871081; Hm_lvt_9707bc8d5f6bba210e7218b8496f076a=1615622673; UM_distinctid=1782a9d936577a-039c7f320768c5-353c540f-1fa400-1782a9d9366721; p_h5_u=967CDCB3-85E2-48CC-A2B8-D8E8716C7B97; CNZZDATA1260553744=272537789-1615618504-%7C1615623905; acw_tc=76b20f7216156271940256737e25711e66775a0a600e9944e8caa9ceed40f0; Hm_lpvt_9707bc8d5f6bba210e7218b8496f076a=1615628812; SERVERID=a6169b2e0636a71b774d6641c064eb8c|1615628815|1615622670',
# 'user-agent':"Mozilla/5.0(WindowsNT 6.1;Win64;x64)AppleWebKit/537.36(KHTML,like Gecko)Chrome/88.0.4324.190Safari/537.36",
# # 'contId':'1723084',
# # 'mrd':'0.8273348737247268',
# # 'Referer': 'https://www.pearvideo.com/video_1723084',
# }
# params ={
# 'contId': '1723084',
# 'mrd': '0.8273348737247268',
#
# }
# r=requests.post(url=url,headers=he,proxies=proxies).json()
# r=requests.post(url=url,headers=he,proxies=proxies).content
# print(r)
# vido_yrl=r['videoInfo']['videos']['srcUrl']

# posted @ 2021-03-28 14:24  mjth  阅读(302)  评论(0)    收藏  举报