爬取豌豆荚app数据
方法一:requests+bs4
import requests
from bs4 import BeautifulSoup
import re


# 1. Send the HTTP request
def get_page(url):
    """Fetch *url* and return the requests Response.

    Raises requests.HTTPError on a non-2xx status so a blocked or failed
    request fails loudly instead of being parsed as an empty page.
    NOTE(review): wandoujia may require a browser User-Agent header —
    confirm and add a headers= dict if requests get rejected.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail fast on 4xx/5xx
    return response


# 2. Parse the listing page
def parse_index(code):
    """Return every <li class="card"> app card from a BeautifulSoup tree."""
    app_info = code.find_all(name='li', class_='card')
    return app_info


# 3. Persist one app's data
def save_data(app):
    """Extract name / detail URL / installs / size from one card tag and
    append a formatted record to 豌豆荚.txt (also echoed to stdout)."""
    app_name = app.h2.a.text
    detail_url = app.h2.a.attrs['href']
    download_num = app.find(class_='install-count').text
    # The size has no class/id of its own: it is taken positionally as the
    # third <span> inside .meta — fragile; TODO confirm against live markup.
    app_size = app.find(class_='meta').find_all(name='span')[2].attrs['title']
    data = f"""
    app名称:{app_name}
    详情页url:{detail_url}
    下载人数:{download_num}
    app大小:{app_size}
    \n
    """
    print(data)
    with open('豌豆荚.txt', 'a', encoding='utf-8') as f:
        f.write(data)


if __name__ == '__main__':
    url = 'https://www.wandoujia.com/category/6001'
    index_response = get_page(url)
    index_code = BeautifulSoup(index_response.text, 'lxml')
    app_list = parse_index(index_code)
    for app in app_list:
        save_data(app)
        print('写入成功......')
方法二:selenium
from selenium import webdriver
import time

# Method 2: drive a real browser so JS-rendered cards are present.
driver = webdriver.Chrome()
try:
    driver.implicitly_wait(20)
    driver.get('https://www.wandoujia.com/category/6001')
    time.sleep(5)
    # Scroll down so lazily-loaded cards below the fold get rendered.
    js_code = '''
    window.scrollTo(0,5000)
    '''
    driver.execute_script(js_code)
    time.sleep(5)  # wait for the newly revealed cards to load
    # NOTE(review): the find_*_by_* methods were removed in Selenium 4;
    # there use driver.find_elements(By.CLASS_NAME, 'card') instead.
    app_list = driver.find_elements_by_class_name('card')
    for app in app_list:
        # App name
        app_name = app.find_element_by_css_selector('.app-title-h2 a').text
        # Detail-page URL
        detail_url = app.find_element_by_css_selector('.app-title-h2 a').get_attribute('href')
        # Install count and app size share one .meta div and the size has no
        # class/id to target, so grab the combined text and slice it apart.
        download_num_size = app.find_element_by_class_name('meta').text
        # NOTE(review): the fixed offsets [:8] / [11:] assume a stable text
        # layout — fragile; splitting on the separator would be safer.
        app_content = f"""
        ========================================
        app名字:{app_name}
        详情页url:{detail_url}
        下载人数:{download_num_size[:8]}
        app大小:{download_num_size[11:]}
        ========================================
        \n
        """
        print(app_content)
        with open('豌豆荚app.txt', 'a', encoding='utf-8') as f:
            f.write(app_content)
        time.sleep(3)
finally:
    # quit() shuts the browser AND the chromedriver process; the original
    # close() only closed the window, leaking the driver process.
    driver.quit()

浙公网安备 33010602011771号