Python homework, June 14
# Crawler workflow: send a request, extract the useful data, save it
# the Internet
# the browser sends an HTTP request
# Pearvideo: parse the video files and save them locally
# Today's topics
# detailed usage of the requests module
# the selenium module
# 1 request libraries
# 2 parsing libraries
# 3 storage libraries
import requests
import re
# response=requests.get(url='https://www.pearvideo.com/video_1566066')
# <a href="video_1461493" class="actplay openapp" target="_blank">
# <div class="video-main">
# <img class="img" src="https://image.pearvideo.com/cont/20181023/cont-1461493-11648067.png" alt="离婚后需要帮前妻还债吗?">
# <div class="vdo-time">02:35</div></div>
# <div class="vdo-summary">
# <p class="vdo-tt">离婚后需要帮前妻还债吗?</p>
# <div class="none vdo-open-tips"></div>
# </div>
# </a>
# re.findall(pattern, text_to_parse, flags)
# print(response.status_code)
# print(response.text)
# res=re.findall('<a href="video_(.*?)"',response.text,re.S)
# # "." matches any one character, "*" repeats it, "?" makes it non-greedy (stop at the first possible match)
# print(res)
# for m_id in res:
#     detail_url='https://www.pearvideo.com/video_'+m_id
#     print(detail_url)
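# A quick self-contained demo of the pattern used above: non-greedy (.*?)
# with re.S. demo_html is a made-up stand-in for the real index page.
demo_html = '''
<a href="video_1461493" class="actplay openapp" target="_blank">
<a href="video_1566066" class="actplay openapp" target="_blank">
'''
# each (.*?) stops at the first closing quote it can reach
print(re.findall('<a href="video_(.*?)"', demo_html, re.S))  # ['1461493', '1566066']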
# fetch the video detail pages
import uuid
# 1 send the request
def get_page(url):
    response = requests.get(url)
    return response
# 2 parse the data
def parse_index(text):
    # pull every video id off the index page
    res = re.findall('<a href="video_(.*?)"', text, re.S)
    detail_url_list = []
    for m_id in res:
        detail_url = 'https://www.pearvideo.com/video_' + m_id
        detail_url_list.append(detail_url)
    return detail_url_list
def parse_detail(text):
    # the playable address is embedded in the detail page as srcUrl="..."
    movie_url = re.findall('srcUrl="(.*?)"', text, re.S)[0]
    return movie_url
# 3 save the data
def save_movie(movie_url):
    response = requests.get(movie_url)
    # uuid4 gives every download a unique file name
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        f.write(response.content)
        f.flush()
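# response.content holds the whole video in memory before writing. A minimal
# streaming variant (my own sketch reusing the imports above, built on
# requests' stream=True / iter_content API) writes the file chunk by chunk:
def save_movie_streamed(movie_url):
    # stream=True defers the body download until iter_content is consumed
    response = requests.get(movie_url, stream=True)
    with open(f'{uuid.uuid4()}.mp4', 'wb') as f:
        for chunk in response.iter_content(chunk_size=1024 * 64):
            f.write(chunk)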
from concurrent.futures import ThreadPoolExecutor

if __name__ == "__main__":
    # sequential version:
    # index_res = get_page(url='https://www.pearvideo.com/')
    # detail_url_list = parse_index(index_res.text)
    # for detail_url in detail_url_list:
    #     detail_res = get_page(url=detail_url)
    #     movie_url = parse_detail(detail_res.text)
    #     print(movie_url)
    #     save_movie(movie_url)

    # thread-pool version: use ThreadPoolExecutor (multiprocessing.pool is a
    # module with no submit method, so the earlier import was a bug)
    pool = ThreadPoolExecutor(50)  # 50 worker threads, an arbitrary choice
    url = 'https://www.pearvideo.com/'
    pool.submit(get_page, url)  # fire-and-forget: the result is discarded
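    # To actually process the fetched index, chain the steps with
    # add_done_callback (a standard concurrent.futures API). The wiring below
    # is my own sketch of how to finish the exercise, not the course solution.
    def parse_and_save(future):
        # the callback receives the finished Future; .result() is the Response
        index_text = future.result().text
        for detail_url in parse_index(index_text):
            detail_res = get_page(detail_url)
            save_movie(parse_detail(detail_res.text))

    pool.submit(get_page, url).add_done_callback(parse_and_save)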
Homework:
'''
Home page: https://movie.douban.com/top250  GET
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36

re regex:
    # detail-page url, image link, movie title, rating, number of raters
    <div class="item">.*?href="(.*?)">.*?src="(.*?)".*?<span class="title">(.*?)</span>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价
'''
import requests
import re

url = 'https://movie.douban.com/top250'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
# 1. send a request to the douban TOP250 page and collect the response
response = requests.get(url, headers=headers)
# print(response.text)

# 2. extract the fields with a regex: detail-page url, image link, title,
# the cast/description <p class="">, rating, number of raters, and the
# one-line tagline <span class="inq">. The groups follow the order the tags
# appear in the page markup; "人评价" is the literal Chinese text on the page.
# Note: entries without an inq tagline are skipped by this pattern.
movie_content_list = re.findall(
    # regex pattern
    '<div class="item">.*?href="(.*?)">.*?src="(.*?)"'
    '.*?<span class="title">(.*?)</span>.*?<p class="">(.*?)</p>'
    '.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价'
    '.*?<span class="inq">(.*?)</span>',
    # text to parse
    response.text,
    # match mode: let "." cross newlines
    re.S)

for movie_content in movie_content_list:
    # unpack one movie per tuple, in the same order as the regex groups
    detail_url, movie_jpg, name, yu, point, num, jili = movie_content
    data = (f'title: {name}, detail url: {detail_url}, image url: {movie_jpg}, '
            f'rating: {point}, raters: {num}, description: {yu}, tagline: {jili}\n')
    # print(data)
    # 3. save the data: append each movie's info to li.txt
    with open('li.txt', 'a', encoding='utf-8') as f:
        f.write(data)
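# The request above only covers the first page (25 movies); top250 paginates
# via the start query parameter (start=0, 25, ..., 225). A sketch of fetching
# any one page (crawl_page is my own helper name, not part of the handout):
def crawl_page(start):
    # same request as above, parameterized by the page's start offset
    page = requests.get(f'https://movie.douban.com/top250?start={start}',
                        headers=headers)
    return page.text

# e.g. for start in range(0, 250, 25): run the same findall/unpack/save
# steps above on crawl_page(start)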