Python homework, June 14
# Crawler principle: extract the useful data and save it
# The internet
# A browser sends an HTTP request
# Pearvideo: locate the video files and save them locally
# Today's topics
# Detailed usage of the requests module
# The selenium module
# 1. Request libraries
# 2. Parsing libraries
# 3. Storage libraries
import requests
import re
# response=requests.get(url='https://www.pearvideo.com/video_1566066')
# <a href="video_1461493" class="actplay openapp" target="_blank">
# <div class="video-main">
# <img class="img" src="https://image.pearvideo.com/cont/20181023/cont-1461493-11648067.png" alt="离婚后需要帮前妻还债吗?">
# <div class="vdo-time">02:35</div></div>
# <div class="vdo-summary">
# <p class="vdo-tt">离婚后需要帮前妻还债吗?</p>
# <div class="none vdo-open-tips"></div>
# </div>
# </a>
# re.findall('regex pattern', 'text to parse', match flags such as re.S)
# print(response.status_code)
# print(response.text)
# res=re.findall('<a href="video_(.*?)"',response.text,re.S)
# # `.` matches any character, `*` repeats it zero or more times, `?` makes it non-greedy (stop at the first complete match)
# print(res)
# for m_id in res:
# detail_url='https://www.pearvideo.com/video_'+m_id
# print(detail_url)
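# A minimal demo of greedy vs. non-greedy matching on a made-up snippet:
# '.*' grabs as much as it can, while '.*?' stops at the first match.
sample = '<a href="video_1">x</a><a href="video_2">y</a>'
print(re.findall('href="video_(.*?)"', sample))  # non-greedy -> ['1', '2']
print(re.findall('href="video_(.*)"', sample))   # greedy -> ['1">x</a><a href="video_2']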
# Fetch the video detail pages
import uuid  # used to generate unique filenames for the saved videos
# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response
# 2. Parse the data
def parse_index(text):
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in res:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list
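# A quick sanity check of parse_index on a made-up anchor tag:
print(parse_index('<a href="video_42" class="actplay">'))
# -> ['https://www.pearvideo.com/video_42']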
def parse_detail(text):
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url
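# A quick sanity check of parse_detail on a hypothetical player-JS line
# (at the time, the detail page embedded the file address as srcUrl="..."):
print(parse_detail('var srcUrl="https://video.pearvideo.com/demo.mp4";'))
# -> https://video.pearvideo.com/demo.mp4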
# 3. Save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()
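# A hedged variant for large files: stream the download in chunks instead of
# holding the whole video in memory (the 1 MB chunk size is an arbitrary choice).
def save_movie_streamed(movie_url):
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)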
from concurrent.futures import ThreadPoolExecutor

if __name__ == "__main__":
    # sequential version, kept for reference:
    # index_res = get_page(url='https://www.pearvideo.com/')
    # detail_url_list = parse_index(index_res.text)
    # for detail_url in detail_url_list:
    #     detail_res = get_page(url=detail_url)
    #     movie_url = parse_detail(detail_res.text)
    #     print(movie_url)
    #     save_movie(movie_url)
    # concurrent version: build a thread pool and submit tasks to it
    pool = ThreadPoolExecutor(50)
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url)
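    # The bare submit above only fetches the index page; a hedged sketch of
    # chaining all three steps through the pool via callbacks (the callback
    # names are my own; each callback receives a Future and pulls the HTTP
    # response out of it with .result()):
    def detail_done(future):
        pool.submit(save_movie, parse_detail(future.result().text))

    def index_done(future):
        for detail_url in parse_index(future.result().text):
            pool.submit(get_page, detail_url).add_done_callback(detail_done)

    pool.submit(get_page, url).add_done_callback(index_done)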
# Homework:
'''
Homepage:
https://movie.douban.com/top250
GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36
re regex:
# detail-page url, image link, movie title, rating, number of votes
<div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re
url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
# 1. Send a request to the Douban Top 250 page and get the response data
response = requests.get(url, headers=headers)
# print(response.text)
# 2. Extract the data with a regex
# detail-page url, image link, movie title, rating, number of votes
movie_content_list = re.findall(
    # regex rule: url, image, title, credits line, rating, vote count, quote
    # (<p class=""> is the credits paragraph and <span class="inq"> the
    # one-line quote; entries that lack a quote can throw the match off)
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)"'
    '.*?<span class="title">(.*?)</span>.*?<p class="">(.*?)</p>'
    '.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价'
    '.*?<span class="inq">(.*?)</span>',
    # text to parse
    response.text,
    # match flag
    re.S)
for movie_content in movie_content_list:
    # unpack one movie's fields (group order matches the regex above)
    detail_url, movie_jpg, name, yu, point, num, jili = movie_content
    data = (f'Title: {name}, detail page: {detail_url}, image: {movie_jpg}, '
            f'rating: {point}, votes: {num}, credits: {yu}, quote: {jili}\n')
    # print(data)
    # 3. Save the data: append the movie record to a file
    with open('li.txt', 'a', encoding='utf-8') as f:
        f.write(data)
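# The Top 250 list spans 10 pages of 25 entries each; a hedged paging sketch
# (assumes the usual ?start= offset parameter; the same findall as above
# would be re-run on each page's text):
for start in range(25, 250, 25):
    page_url = f'https://movie.douban.com/top250?start={start}'
    page = requests.get(page_url, headers=headers)
    # apply the same re.findall(...) to page.text and append as above
    print(page_url, page.status_code)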

