爬虫Day02
import requests
import json
def main():
    """Scrape licence records from the NMPA portal and save them as JSON.

    Steps:
      1. POST to the list endpoint for pages 1 through 5 and collect the
         ``ID`` field of every entry under the ``list`` key of each response.
      2. POST each collected ID to the detail endpoint and keep the decoded
         JSON response.
      3. Dump all detail records to ``./huazhuangpin.json`` (UTF-8, with
         non-ASCII characters written verbatim).

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    # Send a browser-like User-Agent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    # Without a timeout a stalled server would hang the script forever.
    timeout = 10

    id_list = []
    for page in range(1, 6):
        data = {
            'on': 'true',
            'page': str(page),  # sent as a string form field
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        response = requests.post(
            url=url, data=data, headers=headers, timeout=timeout
        )
        # Fail with a clear HTTP error rather than a JSON decode error.
        response.raise_for_status()
        json_ids = response.json()
        id_list.extend(dic['ID'] for dic in json_ids['list'])

    # Fetch the detail record for each company.
    post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    all_data = []
    for license_id in id_list:
        data = {
            'id': license_id
        }
        response = requests.post(
            url=post_url, headers=headers, data=data, timeout=timeout
        )
        response.raise_for_status()
        detail_json = response.json()
        print(detail_json)
        all_data.append(detail_json)

    # Persist the results; the context manager guarantees the file is
    # flushed and closed even if serialization fails.
    with open('./huazhuangpin.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data, fp=fp, ensure_ascii=False)
    print('success')


if __name__ == '__main__':
    main()
总结
requests模块
- Python 中一款用于发起网络请求的模块
urllib模块
- 古老,不推荐使用
如何使用
- 指定url
- UA 伪装（在请求头中设置 User-Agent）
- 处理请求参数
- 发起请求
- 获取响应数据
- 持久化存储
数据解析
- 聚焦爬虫
- 正则
- bs4
- xpath

浙公网安备 33010602011771号