'''
主页:
名称、下载次数、大小、详情页地址
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
'''
import requests
from bs4 import BeautifulSoup
# 1. Send the request
def get_page(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Forwarded
            unchanged to ``requests.get``.

    Returns:
        The response object returned by ``requests.get``. The HTTP status
        is not checked here; callers decide how to treat error responses.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # requests never times out unless told to, so a stalled connection would
    # otherwise block the whole crawl indefinitely.
    return requests.get(url, timeout=timeout)
# Parse the index page
def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or '' when the lookup found nothing."""
    return tag.text if tag is not None else ''


def parse_index(data):
    """Print name, install count, size and detail URL for each app card.

    Every ``<li class="card">`` element found in *data* produces four
    printed lines, in this order: app name, install count, size, detail
    page URL. A field whose tag is missing from a card is printed as an
    empty line instead of aborting the whole run.

    Args:
        data: HTML markup to parse; it is handed directly to
            ``BeautifulSoup(data, 'lxml')``.

    Returns:
        None. Results are written to standard output only.
    """
    # Function-scope import, as in the original; done once here instead of
    # once per card inside the loop.
    import re

    # Raw string so that ``\d`` is not treated as a string escape. The
    # pattern only recognises sizes written as digits followed by "MB";
    # a size in any other unit is not matched and prints as an empty line.
    size_pattern = re.compile(r"\d+MB")

    soup = BeautifulSoup(data, 'lxml')
    # One <li class="card"> per app.
    for app in soup.find_all(name='li', class_="card"):
        # App name: text of the <a class="name"> tag.
        app_name = _tag_text(app.find(name="a", class_="name"))
        print(app_name)

        # Install count: text of the <span class="install-count"> tag.
        down_num = _tag_text(
            app.find(name='span', attrs={"class": "install-count"})
        )
        print(down_num)

        # Size: the <span> whose text matches the size pattern.
        # ``string=`` is the current name of the deprecated ``text=`` filter.
        size = _tag_text(app.find(name='span', string=size_pattern))
        print(size)

        # Detail page URL: href of the first <a> inside the card.
        link = app.find(name='a')
        detail_url = link.attrs.get('href', '') if link is not None else ''
        print(detail_url)
def main(first_page=1, last_page=32):
    """Crawl the category API page by page and print every app found.

    For each page number in ``first_page..last_page`` (inclusive) this
    requests the category endpoint, prints a separator line, decodes the
    JSON body, and passes ``data['content']`` to ``parse_index``. A page
    whose body is not JSON, lacks the expected keys, or has empty content
    is reported and skipped so that one bad response does not end the run.

    Args:
        first_page: First page number to request (default 1).
        last_page: Last page number to request, inclusive (default 32).

    Returns:
        None.
    """
    # NOTE(review): catId, subCatId and ctoken are hard-coded; the ctoken
    # presumably belongs to one browsing session and may stop working --
    # confirm before relying on it.
    for page in range(first_page, last_page + 1):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={page}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
        # 1. Send the request to the app API.
        response = get_page(url)
        # Visual separator between pages in the output.
        print('*' * 1000)

        try:
            # Deserialize the JSON body, then pull out the app markup.
            payload = response.json()
            app_li = payload['data']['content']
        except (ValueError, KeyError, TypeError) as exc:
            # ValueError: body is not valid JSON.
            # KeyError / TypeError: JSON does not have the data -> content shape.
            print(f"page {page}: unexpected API response, skipped ({exc!r})")
            continue

        if not app_li:
            # Nothing to parse on this page.
            print(f"page {page}: empty content, skipped")
            continue

        # 2. Parse the app markup.
        parse_index(app_li)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()