豆瓣top250

 1 import requests
 2 import time
 3 from lxml import etree
 4 from bs4 import BeautifulSoup
 5 import re
 6 from prettytable import PrettyTable
 7 
 8 
 9 def get_url(page_id):
10     url = "https://movie.douban.com/top250?start=" +str(page_id)
11     headers = {
12     "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
13     }
14     html = requests.get(url=url,headers=headers).content.decode("utf-8")
15     se = etree.HTML(html)
16     return se
17 
def _pick(tokens, index):
    """Return ``tokens[index]``, or an empty string if that position is absent."""
    try:
        return tokens[index]
    except IndexError:
        return ""


def get_data(se):
    """Extract the movie entries from one parsed list page.

    For each of the (up to) 25 ``<li>`` items under ``<ol>``, the first
    ``span.title`` text is used as the movie name, and the text of the
    class-less ``<p>`` is tokenised into runs of CJK characters, spaces,
    middle dots, digits and ASCII letters. Fields are then taken from fixed
    positions in that token list.

    Args:
        se: Any object exposing ``xpath(query)`` that returns a list of
            strings, such as the tree returned by ``get_url``.

    Returns:
        A list of dicts with the keys ``movie_names``, ``director``,
        ``main_person``, ``show_time``, ``contury`` and ``type`` (in that
        order). A field whose token is missing is set to ``""``.
    """
    lst = []
    for i in range(1, 26):
        titles = se.xpath(f"//ol/li[{i}]//span[@class='title'][1]/text()")
        if not titles:
            # Fewer than 25 entries on the page (or an entry without a
            # title): skip it rather than abort the scrape with IndexError.
            continue
        data = se.xpath(f"//ol/li[{i}]//p[@class='']/text()")

        s1 = "".join(part.replace("\n", "").strip() for part in data)
        # Match runs of CJK characters, middle dots, spaces, letters and digits.
        ss = re.findall("[\u4E00-\u9FA5 ·0-9a-zA-Z]+", s1)

        # The year is the first run of digits in the third token from the end.
        year = re.search(r"\d+", _pick(ss, -3))

        lst.append({
            "movie_names": titles[0],
            "director": _pick(ss, 1),
            "main_person": _pick(ss, 3),
            "show_time": year.group() if year else "",
            "contury": _pick(ss, -2),
            "type": _pick(ss, -1),
        })
    return lst
41 
42 
def show_table():
    """Scrape all ten list pages and print the movies as one table.

    Pages are requested with ``start`` offsets 0, 25, ..., 225. Every movie
    dict returned by ``get_data`` becomes one table row, with its values in
    dict order lined up under the column headers.
    """
    headers = ("电影名称", "导演", "主演", "年代", "地区", "类型")
    table = PrettyTable(field_names=headers)

    for offset in range(0, 250, 25):
        page = get_url(offset)
        for movie in get_data(page):
            table.add_row(list(movie.values()))
    print(table)
56 
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == '__main__':
    show_table()

 

posted @ 2021-07-23 17:49  Wskid  阅读(66)  评论(0)    收藏  举报