import re
import requests
# Exploratory pass that runs at import time: fetch the video index page,
# collect the first link of every "items" block, then fetch each linked page
# and extract the <video id="media"> source. The result is only kept in the
# module-level name `result` (the print below is disabled).
response = requests.get("http://www.xiaohuar.com/v/", timeout=10)
url_s = re.findall('<div class="items">.*?href="(.*?)"', response.text, re.S)
for url in url_s:
    if not url:
        # An empty href cannot be requested.
        continue
    if url.startswith("/"):
        # Site-relative link: requests needs an absolute URL with a scheme.
        url = "http://www.xiaohuar.com" + url
    try:
        res = requests.get(url, timeout=10)
    except (requests.RequestException, ValueError):
        # One unreachable or malformed link must not abort the whole module.
        continue
    result = re.findall('<video id="media".*?src="(.*?)"', res.text, re.S)
    # print(result)
def get_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL to request. A falsy value is rejected up front.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with status 200,
        otherwise ``None`` (falsy URL, request failure, or any other status).
    """
    if not url:
        return None
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError) as exc:
        # Best-effort fetch: report the failure instead of hiding it, and let
        # the caller skip this page.
        print("request failed for %s: %s" % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_data(text):
    """Yield the detail-page URL found in each ``<div class="items">`` block.

    Args:
        text: HTML of a listing page, or ``None``/empty when the fetch failed.

    Yields:
        One absolute URL per non-empty ``href``. Site-relative links (those
        starting with ``/``) are prefixed with the site root so they can be
        requested directly.
    """
    if not text:
        # A failed fetch hands us None; there is nothing to parse.
        return
    for href in re.findall('<div class="items">.*?href="(.*?)"', text, re.S):
        if not href:
            continue
        if href.startswith("/"):
            href = "http://www.xiaohuar.com" + href
        yield href
def parse_detail(text):
    """Extract the ``.mp4`` source of the ``<video id="media">`` element.

    Args:
        text: HTML of a detail page, or ``None`` when the fetch failed.

    Returns:
        The ``src`` of the first matching video tag when it ends with
        ``.mp4``; ``None`` when *text* is empty, is not a string, contains no
        such tag, or the source is not an ``.mp4`` file.
    """
    if not text:
        return None
    try:
        match = re.search('<video id="media".*?src="(.*?)"', text, re.S)
    except TypeError:
        # Raised by ``re`` when *text* is not a string; treat it as "no video".
        return None
    if match is None:
        return None
    movie_url = match.group(1)
    return movie_url if movie_url.endswith(".mp4") else None
import uuid
def download_movie(movie_url, timeout=30):
    """Download *movie_url* into ``D:\\spider1\\movies`` under a random name.

    Args:
        movie_url: Absolute URL of the video. A falsy value is ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file, or ``None`` when there was nothing to
        download or the download failed.
    """
    if not movie_url:
        # The detail parser returns None for pages without an .mp4 source.
        return None
    import os  # local import: only needed to clean up a partial file

    target = r"D:\spider1\movies\%s.mp4" % uuid.uuid4()
    try:
        # Stream the body so a large video is never held in memory at once.
        with requests.get(movie_url, stream=True, timeout=timeout) as response:
            # Do not save an HTTP error page under an .mp4 name.
            response.raise_for_status()
            with open(target, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print("download failed for %s: %s" % (movie_url, exc))
        try:
            os.remove(target)
        except OSError:
            # The file was never created, so there is nothing to remove.
            pass
        return None
    return target
if __name__ == '__main__':
    # Serial crawl: listing page -> detail pages -> video download.
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in range(1):
        url = base_url.format(line)
        # 1. Request the listing page.
        index_text = get_page(url)
        if not index_text:
            # get_page returns None on failure; skip this listing page.
            continue
        # 2. Parse the listing page into detail-page links.
        urls = parse_data(index_text)
        for detail_url in urls:
            # Visit the detail page and fetch its text.
            detail_text = get_page(detail_url)
            if not detail_text:
                continue
            movie_url = parse_detail(detail_text)
            if not movie_url:
                # No .mp4 source on this page; nothing to save.
                continue
            # Save the video.
            download_movie(movie_url)
from concurrent.futures import ThreadPoolExecutor
# Shared worker pool used by the callback-driven crawler functions below.
pool = ThreadPoolExecutor(50)

# Exploratory pass that runs at import time: fetch the video index page,
# collect the first link of every "items" block, then fetch each linked page
# and extract the <video id="media"> source. The result is only kept in the
# module-level name `result` (the prints below are disabled).
response = requests.get("http://www.xiaohuar.com/v/", timeout=10)
# print(response.text)
url_s = re.findall('<div class="items">.*?href="(.*?)"', response.text, re.S)
for url in url_s:
    # print(url)
    if not url:
        # An empty href cannot be requested.
        continue
    if url.startswith("/"):
        # Site-relative link: requests needs an absolute URL with a scheme.
        url = "http://www.xiaohuar.com" + url
    try:
        res = requests.get(url, timeout=10)
    except (requests.RequestException, ValueError):
        # One unreachable or malformed link must not abort the whole module.
        continue
    result = re.findall('<video id="media".*?src="(.*?)"', res.text, re.S)
    # print(result)
def get_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The URL is printed before the request is made.

    Args:
        url: Absolute URL to request. A falsy value is rejected up front.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded response text when the server answers with status 200,
        otherwise ``None`` (falsy URL, request failure, or any other status).
    """
    print(url)
    if not url:
        return None
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.RequestException, ValueError) as exc:
        # Best-effort fetch: report the failure instead of hiding it, so the
        # done-callback simply receives None.
        print("request failed for %s: %s" % (url, exc))
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse(res):
    """Done-callback for a listing-page fetch.

    Reads the page text from the finished future *res*, extracts the link of
    every ``<div class="items">`` block and submits a ``get_page`` task for
    each one, with ``parse_detail`` attached as its done-callback.
    """
    page_html = res.result()
    if not page_html:
        return
    hrefs = re.findall('<div class="items">.*?href="(.*?)"', page_html, re.S)
    # filter(None, ...) drops empty hrefs.
    for href in filter(None, hrefs):
        # Site-relative links get the site root prepended.
        target = "http://www.xiaohuar.com" + href if href.startswith("/") else href
        detail_future = pool.submit(get_page, target)
        detail_future.add_done_callback(parse_detail)
def parse_detail(res):
    """Done-callback for a detail-page fetch.

    Reads the page text from the finished future *res*; when the first
    ``<video id="media">`` tag has a ``src`` ending in ``.mp4``, submits a
    ``download_movie`` task for it to the shared pool. Does nothing when the
    text is empty, is not a string, or has no such ``.mp4`` source.
    """
    text = res.result()
    if not text:
        return
    try:
        match = re.search('<video id="media".*?src="(.*?)"', text, re.S)
    except TypeError:
        # Raised by ``re`` when *text* is not a string; treat it as "no video".
        return
    if match is None:
        return
    movie_url = match.group(1)
    if movie_url.endswith(".mp4"):
        pool.submit(download_movie, movie_url)
import uuid
def download_movie(movie_url, timeout=30):
    """Download *movie_url* into ``D:\\spider1\\movies`` under a random name.

    Args:
        movie_url: Absolute URL of the video. A falsy value is ignored.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the written file, or ``None`` when there was nothing to
        download or the download failed.
    """
    if not movie_url:
        return None
    import os  # local import: only needed to clean up a partial file

    target = r"D:\spider1\movies\%s.mp4" % uuid.uuid4()
    try:
        # Stream the body so a large video is never held in memory at once;
        # many downloads may run in parallel on the pool.
        with requests.get(movie_url, stream=True, timeout=timeout) as response:
            # Do not save an HTTP error page under an .mp4 name.
            response.raise_for_status()
            with open(target, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    if chunk:
                        f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print("download failed for %s: %s" % (movie_url, exc))
        try:
            os.remove(target)
        except OSError:
            # The file was never created, so there is nothing to remove.
            pass
        return None
    return target
if __name__ == '__main__':
    # Threaded crawl entry point: queue one fetch per listing page and let the
    # done-callbacks (parse -> parse_detail -> download_movie) do the rest.
    # NOTE(review): nothing here waits for the callback chain to finish. If
    # the interpreter starts shutting the executor down once this loop ends,
    # tasks submitted later from callbacks may be rejected -- confirm on the
    # target Python version.
    base_url = "http://www.xiaohuar.com/list-3-{}.html"
    for line in (0, 1):
        url = base_url.format(line)
        # 1. Send the request.
        future = pool.submit(get_page, url)
        future.add_done_callback(parse)