1 import requests
2 # 导入lxml使用xpath提取数据
3 from lxml import etree
def douban_movies(m_type, nums, key_word=None):
    """Fetch a Douban movie chart and append its entries to a CSV file.

    Args:
        m_type: Query-string fragment selecting the chart category,
            e.g. ``"type=11"``.
        nums: Number of entries to request (the ``limit`` query parameter);
            an int or a numeric string.
        key_word: Category label used as the output file-name prefix. When
            omitted, a module-level ``key_word`` (set by the script entry
            point) is used, and ``m_type`` as a last resort.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import csv
    import os

    if key_word is None:
        # Backward compatibility: earlier callers relied on a module-level
        # ``key_word``; fall back to it instead of raising NameError.
        key_word = globals().get('key_word', m_type)

    # str() so that an integer ``nums`` does not break the concatenation.
    url = ("https://movie.douban.com/j/chart/top_list?" + m_type
           + "&interval_id=100%3A90&action=&start=0&limit=" + str(nums))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    datas = response.json()
    if not datas:
        # Nothing to write; do not create an empty file.
        return

    # (CSV column name, key in the JSON record)
    columns = (
        ('image', 'cover_url'),
        ('types', 'types'),
        ('regions', 'regions'),
        ('title', 'title'),
        ('url', 'url'),
        ('release_date', 'release_date'),
        ('score', 'score'),
        ('actors', 'actors'),
    )

    path = './' + key_word + '豆瓣电影分类排行榜爬取.csv'
    # Emit the header only when starting a new (or empty) file.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0

    # Open once for the whole batch; newline='' is required by the csv module.
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow([column for column, _ in columns])
        for data in datas:
            row = []
            for _, source_key in columns:
                value = data.get(source_key, '')
                # Flatten multi-valued fields into a single cell.
                if isinstance(value, (list, tuple)):
                    value = '/'.join(str(item) for item in value)
                row.append(value)
            writer.writerow(row)
28
29
def get_type():
    """Scrape the Douban type-rank page for the available movie categories.

    Returns:
        dict: Maps each category name (the link text) to the second
        ``&``-separated component of that link's ``href``. Spans without a
        usable link are skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()

    movies_type = {}
    douban_html = etree.HTML(response.text)
    if douban_html is None:
        # Nothing parseable came back (e.g. an empty body).
        return movies_type

    spans = douban_html.xpath("//div[@class='article']/div[2]/div[@class='types']/span")
    for span in spans:
        hrefs = span.xpath(".//@href")
        anchors = span.xpath(".//a")
        if not hrefs or not anchors:
            # Span carries no link; indexing [0] would raise IndexError.
            continue
        parts = hrefs[0].split('&')
        name = anchors[0].text
        if len(parts) < 2 or not name:
            # Unexpected href layout or empty link text; skip the entry.
            continue
        movies_type[name] = parts[1]

    return movies_type
48
if __name__ == '__main__':
    # Request analysis:
    #   The page shows 20 entries at a time. The JSON request URLs look like:
    #   https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=0&limit=20
    #   https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20
    #   https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=40&limit=20
    #   start - offset of the first entry
    #   limit - number of entries returned
    #   type  - movie category
    #
    # ``key_word`` is deliberately a module-level name: douban_movies() uses
    # it to build the output file name.
    # Strip surrounding whitespace so "剧情 " still matches the category "剧情".
    key_word = input('请输入查询分类排行榜>>').strip()
    nums = input('请输入查询数据数量>>').strip()

    # Validate the count before making any network request.
    try:
        count = int(nums)
    except ValueError:
        count = 0

    if count <= 0:
        print('输入数量必须为正整数!!!')
    else:
        # Look up the available categories.
        movies_type = get_type()
        m_type = movies_type.get(key_word)
        if m_type is None:
            print('输入电影分类不存在!!!')
        else:
            # Run the crawl; pass the count as a normalized string.
            douban_movies(m_type, str(count))