''''''
'''
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
1、发送请求
2、解析数据
3、保存数据
'''
import requests
import re
#爬虫三部曲
#1、发送请求
def get_page(base_url, headers=None, timeout=10):
    """Send a GET request for ``base_url`` and return the response.

    Args:
        base_url: URL of the page to download.
        headers: Optional dict of HTTP headers. When omitted, a browser-like
            ``User-Agent`` is sent.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` object; the status code is not checked
        here, so callers decide how to treat non-200 answers.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    if headers is None:
        # NOTE(review): some sites reject the default python-requests
        # User-Agent; confirm whether the target still needs this.
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/120.0 Safari/537.36'
            ),
        }
    response = requests.get(base_url, headers=headers, timeout=timeout)
    return response
#2、解析文本
def parse_index(text):
    """Extract the movie entries from the HTML of one listing page.

    Args:
        text: HTML source of the page.

    Returns:
        A list of 7-tuples of strings, in page order:
        ``(rank, url, title, director_text, rating, votes, quote)``.
        ``director_text`` is everything between ``导演:`` and the closing
        ``</p>``. ``quote`` is ``''`` when the entry has no
        ``<span class="inq">`` element.
    """
    # Mandatory fields of one entry, matched from the start of its chunk.
    head_re = re.compile(
        '.*?<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>'
        '.*?导演:(.*?)</p>.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>',
        re.S,
    )
    # Optional one-line quote that follows the mandatory fields.
    quote_re = re.compile('<span class="inq">(.*?)</span>', re.S)

    res = []
    # Match each entry on its own chunk. A single page-wide pattern lets an
    # entry without a quote run on into the next entry, which steals that
    # entry's quote and drops it from the result.
    for chunk in text.split('<div class="item">')[1:]:
        head = head_re.match(chunk)
        if head is None:
            # Chunk lacks one of the mandatory fields; skip it.
            continue
        quote = quote_re.search(chunk, head.end())
        res.append(head.groups() + (quote.group(1) if quote else '',))
    return res
#3、保存数据
def save_data(data, path='douban.txt'):
    """Append ``data`` to a UTF-8 encoded text file.

    Args:
        data: Text to append.
        path: File to append to; created if it does not exist. Defaults to
            ``douban.txt`` in the current working directory.
    """
    # Mode 'a' keeps earlier records, so repeated calls accumulate output.
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
#main + 回车键
if __name__ == '__main__':
    # The listing is paginated 25 entries per page; ``start`` is the offset
    # of the first entry on each of the 10 pages (0, 25, ..., 225).
    for start in range(0, 250, 25):
        base_url = f'https://movie.douban.com/top250?start={start}&filter='
        print(base_url)
        # 1. Send the request.
        response = get_page(base_url)
        # 2. Parse the page text into per-movie tuples.
        movie_list = parse_index(response.text)
        # 3. Format and save each movie.
        for movie in movie_list:
            # Unpack: rank, url, title, director/cast/genre text, rating,
            # number of ratings, one-line quote.
            v_top, v_url, v_name, v_daoyan, v_point, v_num, v_desc = movie
            # Built from explicit pieces so the indentation of this code
            # never ends up inside the saved text.
            movie_content = (
                f'\n电影排名:{v_top}\n'
                f'电影url:{v_url}\n'
                f'电影名称:{v_name}\n'
                f'电影主演:{v_daoyan}\n'
                f'电影评分:{v_point}\n'
                f'评价人数:{v_num}\n'
                f'电影简介:{v_desc}\n'
                '\n\n'
            )
            print(movie_content)
            # Append the formatted record to the output file.
            save_data(movie_content)