1 from lxml import etree
2 import requests
3
4
# Root of the site; relative links scraped from listing pages are resolved
# against it.
baseurl = 'https://www.dytt8.net'

# Header values sent with every request issued by this module.
_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
_REFERER = 'https://www.dytt8.net/html/gndy/dyzz/index.html'

headers = {
    'User-Agent': _USER_AGENT,
    'Referer': _REFERER,
}
def agent(ur, timeout=10):
    """Download ``ur`` and return its body parsed as an lxml HTML tree.

    Args:
        ur: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        The result of ``etree.HTML`` applied to the response text.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            ``timeout`` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would hang the whole scrape.
    resp = requests.get(ur, headers=headers, timeout=timeout)
    # NOTE(review): the text encoding is left to requests here, whereas
    # xiangqingye() decodes explicitly as GBK. movie_url_list() only reads
    # href attributes from this tree -- confirm that is the only consumer
    # before relying on any non-ASCII text from it.
    return etree.HTML(resp.text)
17
def movie_url_list(html):
    """Return the ``href`` of every link found inside a ``tbspan`` table.

    Args:
        html: Parsed document exposing an ``xpath`` method.

    Returns:
        Whatever ``html.xpath`` yields for the link query, in document order.
    """
    link_query = "//table[@class='tbspan']//a/@href"
    return html.xpath(link_query)
21
def parse_info(info, rule):
    """Remove every occurrence of ``rule`` from ``info`` and trim whitespace.

    Args:
        info: Raw text taken from the page.
        rule: Substring (typically a field label) to delete.

    Returns:
        ``info`` without ``rule`` and without leading/trailing whitespace.
    """
    without_label = info.replace(rule, '')
    return without_label.strip()
24
# Label that introduces each field in the detail text, paired with the key
# the value is stored under in the result dict.
# NOTE(review): labels must match the page text character-for-character,
# including the whitespace inside them -- confirm against a live page.
_FIELD_LABELS = (
    ("◎片 名", 'pianming'),
    ("◎年 代", 'niandai'),
    ("◎产 地", 'chandi'),
    ("◎类 别", 'leixing'),
    ("◎上映日期", 'shangyingshijian'),
    ("◎豆瓣评分", 'doubanpingfen'),
    ("◎片 长", 'pianchang'),
    ("◎标 签", 'biaoqian'),
)


def _extract_fields(text_nodes):
    """Build the movie dict from the text nodes of a detail page."""
    movie = {}
    for node_text in text_nodes:
        for label, key in _FIELD_LABELS:
            if node_text.startswith(label):
                movie[key] = parse_info(node_text, label)
                # One text node carries at most one field; stopping here
                # keeps the stripped value from being matched against the
                # remaining labels.
                break
    return movie


def xiangqingye(url, timeout=10):
    """Fetch a movie detail page and extract its labelled fields.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up; passed
            straight through to ``requests.get``.

    Returns:
        A dict holding the value of every known label found in the text of
        the ``div`` whose id is ``Zoom``. Labels that do not appear are
        simply absent; when a label appears more than once the last
        occurrence wins.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            ``timeout`` seconds.
    """
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Decode leniently: with the default strict handler a single malformed
    # byte sequence would raise UnicodeDecodeError and lose the whole page.
    text = resp.content.decode('gbk', errors='replace')
    html = etree.HTML(text)
    return _extract_fields(html.xpath('//div[@id="Zoom"]//text()'))
57
def alldata(first_page=1, last_page=1):
    """Scrape the detail fields of every movie on a range of listing pages.

    Args:
        first_page: Number of the first listing page to visit.
        last_page: Number of the last listing page to visit (inclusive).
            The defaults visit page 1 only.

    Returns:
        A list with one dict per movie, in the order the links appear on
        the listing pages.
    """
    # Local import: nothing else in this module needs it.
    from urllib.parse import urljoin

    list_prefix = 'https://www.dytt8.net/html/gndy/dyzz/list_23_'
    list_suffix = '.html'
    movies = []
    for page in range(first_page, last_page + 1):
        listing = agent(list_prefix + str(page) + list_suffix)
        for href in movie_url_list(listing):
            # urljoin gives the same result as plain concatenation for
            # root-relative hrefs, and also copes with absolute URLs and
            # hrefs that lack a leading slash.
            movies.append(xiangqingye(urljoin(baseurl, href)))
    return movies
if __name__ == '__main__':
    # Run the scrape with its default settings and print the resulting list.
    scraped = alldata()
    print(scraped)