'''
爬取豌豆荚app数据
spider_method:
requests + bs4
or
selenium
url:
https://www.wandoujia.com/category/6001
data:
名称、详情页url、下载人数、app大小
app_name, detail_url, download_num, app_size
'''
from bs4 import BeautifulSoup
# 爬虫三部曲
# 1.发送请求
import requests
def get_page(url, timeout=10):
    """Fetch *url* and return the raw requests.Response.

    Args:
        url: Page URL to request.
        timeout: Seconds to wait for the server. Added because
            ``requests.get`` without a timeout can block forever;
            the default keeps existing callers working unchanged.

    Returns:
        requests.Response: the response object (callers read ``.text``).
    """
    response = requests.get(url, timeout=timeout)
    # NOTE(review): no status check — a 4xx/5xx body would still be
    # parsed downstream; consider response.raise_for_status().
    return response
# 2.解析数据
import re
def parse_index(html):
    """Extract app records from the category-page HTML.

    Returns a list of 4-tuples in regex-group order:
    (detail_url, app_name, download_num, app_size).
    """
    # One multi-field pattern; split across lines only for readability —
    # the concatenated text is identical to the original expression.
    pattern = re.compile(
        '<h2 class="app-title-h2"><a href="(.*?)" title="(.*?)" class="name">.*?</a>.*?'
        '<span class="install-count">(.*?)万人安装</span> <span class="dot">・</span> '
        '<span title="(.*?)MB">.*?MB</span>',
        re.S)
    game_list = pattern.findall(html)
    print(game_list)
    return game_list
# 3.保存数据
def save_data(game):
    """Format one scraped record, print it, and append it to wandoujia.txt.

    Args:
        game: 4-tuple from parse_index, in regex-group order:
            (detail_url, app_name, download_num, app_size).
    """
    # BUG FIX: the regex captures href (detail_url) first and title
    # (app_name) second; the original unpacked them in the opposite
    # order, so the report swapped the game name and the URL.
    detail_url, app_name, download_num, app_size = game
    data = f'''
=========欢迎=========
游戏名称:{app_name}
详情页url:{detail_url}
下载人数:{download_num}万人
app大小:{app_size}MB
=========再见=========
\n
'''
    print(data)
    with open('wandoujia.txt', 'a', encoding='utf-8') as f:
        f.write(data)
if __name__ == '__main__':
    # Category page for games (category id 6001) on wandoujia.
    # (Was an f-string with no placeholders — plain literal is correct.)
    url = 'https://www.wandoujia.com/category/6001'
    print(url)
    # 1. Fetch the index page.
    index_res = get_page(url)
    # 2. Parse the page into app records.
    game_list = parse_index(index_res.text)
    # 3. Persist each record.
    for game in game_list:
        save_data(game)