【爬虫】Python获取星巴克所有产品
视频只介绍了BS4的简单使用,但我想全部获取出来
其实翻看接口可以发现,站点有一个 JSON 资源直接提供了这些数据,但其中没有分类信息
import re
import urllib.request
from bs4 import BeautifulSoup
import json
import datetime
# Scrape the Starbucks China menu page and dump every product (category,
# name, image URL) into a timestamped JSON file in the working directory.

# Regex capturing the text between a pair of double quotes. It is applied to
# the inline ``style`` attribute of each product's <div>; presumably that
# style looks like ``background-image: url("/path/to.jpg")`` -- TODO confirm
# against the live page.
REGEXP1 = '\"([^\"]*)\"'
# Site origin (prefixed to the image paths found in the page) and menu page URL.
SOURCE = 'https://www.starbucks.com.cn'
API = 'https://www.starbucks.com.cn/menu/'

# The page is fetched with a plain GET (no extra headers are sent).
# A timeout keeps the script from hanging forever on a stalled connection,
# and the ``with`` block closes the HTTP response once the body is read.
with urllib.request.urlopen(API, timeout=30) as response:
    html = response.read().decode('UTF-8')

# Parse the page with BeautifulSoup (lxml parser).
soupObject = BeautifulSoup(html, 'lxml')
ulList = soupObject.select('ul[class="grid padded-3 product"]')

# Accumulates one dict per product; serialized to JSON below.
productList = []

# Walk every product <ul>; each one is expected to carry its category name.
for ul in ulList:
    category = ul.select_one(selector='h3.caption')
    if category is None:
        # No category caption in this list: skip it entirely.
        continue
    categoryName = category.text
    print(categoryName)

    # Each product is an <a> directly under an <li>.
    aTagList = ul.select(selector='li > a')
    for aTag in aTagList:
        name = aTag.text.strip()

        # The image path lives in the inline style of the anchor's first <div>.
        # Guard every step so one malformed entry does not abort the whole run;
        # such a product is still recorded, with ``image`` set to None.
        imageDiv = aTag.select_one(selector='div')
        styleStr = imageDiv.get('style') if imageDiv is not None else None
        quotedParts = re.findall(REGEXP1, styleStr) if styleStr else []
        imgUrl = SOURCE + quotedParts[0] if quotedParts else None
        print(f'{name} {imgUrl}')

        # Package the record.
        product = {
            'type': categoryName,
            'name': name,
            'image': imgUrl,
        }
        productList.append(product)

# Serialize the collected products. ensure_ascii=False keeps the Chinese text
# readable in the UTF-8 output file instead of emitting \uXXXX escapes.
jsonData = json.dumps(productList, ensure_ascii=False)

# Write to disk; the file name carries the current local time.
nowTime = datetime.datetime.now()
nowTime = datetime.datetime.strftime(nowTime, '%Y年%m月%d日%H时%M分%S秒')
# ``with`` guarantees the file is flushed and closed even if the write fails.
with open(file=f'星巴克产品菜单-{nowTime}.json', mode='w', encoding='UTF-8') as fp:
    fp.write(jsonData)

浙公网安备 33010602011771号