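'''
Wandoujia app crawler: data to collect.

Listing page (category API):
    icon URL, download count, size, detail-page URL

Detail page:
    app name, approval rating, comment count, editor's review, download URL,
    description, user comments, 1-5 screenshot URLs

Category API endpoints (only the `page` query parameter changes):
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B

32 pages in total (page=1 .. page=32).
'''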
15 import requests
16 from bs4 import BeautifulSoup
17 from pymongo import MongoClient
18 import re
19
# Open a connection to the MongoDB server on localhost:27017.
client = MongoClient(host='localhost', port=27017)
# Both collections live in the 'wandoujia' database:
#   index_col  - records scraped from the category listing pages
#   detail_col - records scraped from the individual app detail pages
index_col, detail_col = (
    client['wandoujia'][col_name] for col_name in ('index', 'detail')
)
26
27 # 1、发送请求
def get_page(url, timeout=10):
    """Send a GET request to *url* and return the response object.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The ``requests.Response`` returned by ``requests.get``. The HTTP
        status is not checked here; callers decide how to treat it.
    """
    return requests.get(url, timeout=timeout)
31
32 # 2、开始解析
33 # 解析详情页
def parse_detail(text):
    """Parse an app detail page and store the extracted record in MongoDB.

    The page is searched for the app name, approval rating, comment count,
    editor's review and download link. Any field whose tag is absent is
    recorded as ``None``, except that a missing rating and a missing
    download link are replaced by placeholder strings. The record is then
    inserted into ``detail_col``.

    Args:
        text: HTML source of the detail page.
    """
    soup = BeautifulSoup(text, 'lxml')

    def text_of(tag_name, css_class):
        # Text of the first matching tag, or None when the tag is absent.
        tag = soup.find(name=tag_name, attrs={"class": css_class})
        return tag.text if tag is not None else None

    # App name
    name = text_of("span", "title")
    # Approval rating
    love = text_of("span", "love")
    # Comment count
    commit_num = text_of("a", "comment-open")
    # Editor's review
    commit_content = text_of("div", "con")
    # Download link: the tag may be missing, or present without an href.
    download_link = soup.find(name="a", attrs={"class": "normal-dl-btn"})
    download_url = (
        download_link.get("href") if download_link is not None else None
    )

    # Pass the extracted values (not their names) to the template.
    print('''
    ============= tank ==============
    app名称:{name}
    好评率: {love}
    评论数: {commit_num}
    小编点评: {commit_content}
    app下载链接: {download_url}
    ============= end ==============
    '''.format(name=name, love=love, commit_num=commit_num,
               commit_content=commit_content, download_url=download_url)
    )

    # Always build the record so it is defined whichever fields are missing.
    # Each placeholder is applied to its own field only, so a missing rating
    # no longer overwrites a valid download link (and vice versa).
    detail_data = {
        'name': name,
        'love': love if love else "没人点赞,很惨",
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url if download_url else "没有安装包",
    }

    # insert_one replaces Collection.insert, which was removed in PyMongo 4.
    detail_col.insert_one(detail_data)
    print('{name}app数据插入成功!'.format(name=name))
117
118
119
120
121 # 解析主页
def parse_index(data):
    """Parse a listing-page HTML fragment, store each app, and crawl its detail page.

    For every ``<li class="card">`` element the icon URL, download count,
    size and detail-page URL are extracted, printed and inserted into
    ``index_col``. The detail page is then fetched with ``get_page`` and
    handed to ``parse_detail``.

    Args:
        data: HTML fragment containing the app cards.
    """
    soup = BeautifulSoup(data, 'lxml')
    # Compiled once outside the loop; raw string so "\d" is a regex escape.
    size_pattern = re.compile(r"\d+MB")

    # Every app is rendered as an <li class="card"> element.
    for app in soup.find_all(name='li', attrs={"class": "card"}):
        # Icon URL: data-original attribute of the card's first <img>.
        img_tag = app.find(name='img')
        img = img_tag.get('data-original') if img_tag is not None else None
        print(img)

        # Download count: text of <span class="install-count">.
        count_tag = app.find(name='span', attrs={"class": "install-count"})
        down_num = count_tag.text if count_tag is not None else None
        print(down_num)

        # Size: the <span> whose text contains digits followed by "MB".
        # Search inside this card, not the whole document, otherwise every
        # record would receive the size of the first card on the page.
        size_tag = app.find(name='span', string=size_pattern)
        size = size_tag.text if size_tag is not None else None
        print(size)

        # Detail-page URL: href of the card's first <a>.
        link_tag = app.find(name='a')
        detail_url = link_tag.get('href') if link_tag is not None else None
        print(detail_url)

        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url,
        }

        # insert_one replaces Collection.insert, which was removed in PyMongo 4.
        index_col.insert_one(index_data)
        print('主页数据插入成功!')

        # Without a detail URL there is nothing further to fetch for this card.
        if not detail_url:
            continue

        # Request the app's detail page and parse it.
        response = get_page(detail_url)
        parse_detail(response.text)
174
175
def main():
    """Crawl category pages 1-32 and close the MongoDB client afterwards.

    Each page is requested from the category API, the JSON response is
    decoded, and the HTML fragment under ``data -> content`` is passed to
    ``parse_index``. The MongoDB client is closed when the crawl finishes,
    including when it is aborted by an exception.
    """
    url_template = (
        "https://www.wandoujia.com/wdjweb/api/category/more"
        "?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
    )

    try:
        for line in range(1, 33):
            # Substitute the page number; the unformatted template would
            # request the same literal URL on every iteration.
            url = url_template.format(line=line)

            # Request the category API for this page.
            response = get_page(url)
            print('*' * 1000)
            # Decode the JSON body into a dict.
            data = response.json()

            # HTML fragment holding the app cards of this page.
            app_li = data['data']['content']
            parse_index(app_li)
    finally:
        # Release the MongoDB connection once all pages have been processed.
        client.close()
195
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    main()