1 # coding=utf-8
2 from urllib import request
3 import requests
4 import re
# Breakpoint debugging (earlier draft of the spider, kept commented out)
# class Spider():
# url='https://www.panda.tv/cate/lol'
# root_pattern='<div class="video-info">[\s\S]*?</div>'  # the trailing ? switches the quantifier from greedy to non-greedy; this pattern is non-greedy
# def __fetch_countent(self):  # open the web page to be parsed
# r=request.urlopen(Spider.url)  # Spider.url reads the class attribute through the class name
# htmls= r.read()
# htmls=str(htmls,encoding='utf-8')
# print(htmls)
# return htmls
# a=1
#
# def __analysis(self,htmls):  # detailed analysis
# root_html=re.findall (Spider.root_pattern,htmls)
# print(root_html)
# a=1
# def go(self):
# htmls=self.__fetch_countent()
# self.__analysis(htmls)
#
#
# youtube=Spider()
# youtube.go()
28
class Spider():
    """Scrape a panda.tv category page and print its streamers ranked by viewer count.

    The pipeline, driven by :meth:`go`, is: fetch the page, extract the raw
    name/number fragments of every streamer card with regular expressions,
    clean them up, sort them by viewer count (descending) and print the
    ranking.
    """

    # Category page that is downloaded and parsed.
    url = 'https://www.panda.tv/cate/lol'
    # One match per streamer card. The quantifier is non-greedy so that each
    # match stops at the first closing </div> instead of swallowing the page.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # Streamer name: the text between the icon's closing </i> and the next </span>.
    name_pattern = r'</i>([\s\S]*?)</span>'
    # Viewer count text. The attribute name keeps its original spelling so
    # that existing references to `Spider.number_patter` continue to work.
    number_patter = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return the response body decoded as UTF-8.

        Raises:
            requests.RequestException: on connection problems, on timeout, or
                when the server answers with an HTTP error status.
        """
        # A timeout prevents the script from hanging forever on a dead server.
        response = requests.get(Spider.url, timeout=10)
        # Fail loudly on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        return response.content.decode('utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches of every streamer card.

        Args:
            htmls: HTML text of the category page.

        Returns:
            A list with one dict per card: ``{'name': [...], 'number': [...]}``
            where each value is the list of strings captured by the
            corresponding pattern (possibly empty).
        """
        return [
            {
                'name': re.findall(Spider.name_pattern, card),
                'number': re.findall(Spider.number_patter, card),
            }
            for card in re.findall(Spider.root_pattern, htmls)
        ]

    def __refine(self, list_renqi):
        """Reduce each raw entry to its first, whitespace-stripped name and number.

        Entries for which either pattern matched nothing are skipped, so a
        single malformed card cannot abort the whole run with an IndexError.

        Args:
            list_renqi: the list produced by ``__analysis``.

        Returns:
            A lazy iterator of ``{'name': str, 'number': str}`` dicts.
        """
        return (
            {
                'name': entry['name'][0].strip(),
                'number': entry['number'][0].strip(),
            }
            for entry in list_renqi
            if entry['name'] and entry['number']
        )

    def __sort(self, list_renqi):
        """Return a new list of entries ordered by viewer count, highest first."""
        return sorted(list_renqi, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, dic_renqi):
        """Sort key: convert an entry's viewer-count text into a float.

        The first integer or decimal number found in the text is used, and it
        is multiplied by 10000 when the text contains the unit '万'
        (ten thousand), e.g. ``'1.5万'`` becomes ``15000.0``. Text without any
        digits yields ``0.0`` so such entries sort last instead of raising.
        """
        text = dic_renqi['number']
        # `\d+(?:\.\d+)?` keeps the fractional part; the previous `\d*` both
        # dropped it and could match the empty string.
        found = re.search(r'\d+(?:\.\d+)?', text)
        if found is None:
            return 0.0
        number = float(found.group())
        if '万' in text:
            number *= 10000
        return number

    def __show(self, list_renqi):
        """Print one ``rank N:<name> <number>`` line per entry, starting at rank 1."""
        for rank, entry in enumerate(list_renqi, 1):
            print('rank ' + str(rank)
                  + ':' + entry['name']
                  + ' ' + entry['number'])

    def go(self):
        """Run the whole pipeline: fetch, analyse, refine, sort and display."""
        htmls = self.__fetch_content()
        list_renqi = self.__analysis(htmls)
        list_renqi = list(self.__refine(list_renqi))
        list_renqi = self.__sort(list_renqi)
        # __show only prints; it has no return value worth keeping.
        self.__show(list_renqi)
88
# The module-level `spider` instance is still created on import, but the
# network crawl only runs when this file is executed as a script, so that
# importing the module has no network or printing side effects.
spider = Spider()
if __name__ == "__main__":
    spider.go()