1 import requests
2 import time
3 from lxml import etree
4 from bs4 import BeautifulSoup
5 import re
6 from prettytable import PrettyTable
7
8
def get_url(page_id):
    """Download one page of the Douban Top 250 list and parse it with lxml.

    Args:
        page_id: Value appended to the ``start`` query parameter of the list
            URL (the caller in this file passes multiples of 25).

    Returns:
        The root element returned by ``etree.HTML`` for the page body.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = "https://movie.douban.com/top250?start=" + str(page_id)
    # A desktop Chrome User-Agent is sent instead of the default client one
    # (presumably because the site rejects non-browser clients -- confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36"
    }
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the whole script.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later with an unrelated IndexError.
    response.raise_for_status()
    html = response.content.decode("utf-8")
    return etree.HTML(html)
17
# Runs of CJK ideographs, spaces, middle dots and ASCII alphanumerics. Every
# other character (":", "/", ".", "\xa0", newlines, ...) acts as a separator.
_INFO_TOKEN_RE = re.compile(r"[\u4E00-\u9FA5 ·0-9a-zA-Z]+")


def _token_after(tokens, label):
    """Return the stripped token that follows ``label``, or "" if absent."""
    for position, token in enumerate(tokens[:-1]):
        if token.strip() == label:
            return tokens[position + 1].strip()
    return ""


def get_data(se):
    """Extract one record per movie entry from a parsed list page.

    Args:
        se: Parsed page exposing an lxml-style ``xpath`` method; each
            ``//ol/li`` node is treated as one movie entry.

    Returns:
        A list of dicts, one per entry, with the keys ``movie_names``,
        ``director``, ``main_person``, ``show_time``, ``contury`` and ``type``
        inserted in that order (``show_table`` relies on the order). A field
        that cannot be found is returned as an empty string.
    """
    movies = []
    # Walk the entries that are actually present rather than assuming exactly
    # 25 per page, so a short page no longer raises IndexError.
    for item in se.xpath("//ol/li"):
        titles = item.xpath(".//span[@class='title'][1]/text()")
        info_lines = item.xpath(".//p[@class='']/text()")

        # Join with "\n" (which the token pattern does not match) so the end
        # of one text node can never merge with the start of the next one.
        info = "\n".join(line.replace("\n", "").strip() for line in info_lines)
        tokens = _INFO_TOKEN_RE.findall(info)

        # The last three tokens are read as year, region and genre. The year
        # token may contain no digits at all, in which case it is left empty.
        year_match = re.search(r"\d+", tokens[-3]) if len(tokens) >= 3 else None

        movies.append({
            "movie_names": titles[0] if titles else "",
            # Look the people up by their label instead of by fixed position,
            # so a missing or truncated cast label yields "" rather than an
            # unrelated token.
            "director": _token_after(tokens, "导演"),
            "main_person": _token_after(tokens, "主演"),
            "show_time": year_match.group() if year_match else "",
            # NOTE: the key spelling "contury" is kept as-is because it is
            # part of the returned schema.
            "contury": tokens[-2].strip() if len(tokens) >= 2 else "",
            "type": tokens[-1].strip() if tokens else "",
        })
    return movies
41
42
def show_table():
    """Fetch ten list pages and print all extracted movies as one table.

    Pages are requested at start offsets 0, 25, ..., 225. Each dict returned
    by ``get_data`` becomes one table row, with its values in insertion order.
    """
    headers = ("电影名称", "导演", "主演", "年代", "地区", "类型")
    table = PrettyTable(field_names=headers)

    for offset in range(0, 250, 25):
        page = get_url(offset)
        for movie in get_data(page):
            table.add_row(list(movie.values()))
    print(table)
56
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    show_table()