爬虫Day02

import requests
import json
# Endpoint that returns one page of list entries; each entry carries an 'ID'.
LIST_URL = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
# Endpoint that returns the detail record for a single ID.
DETAIL_URL = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
# Browser-like User-Agent sent with every request (UA spoofing).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
}
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10
OUTPUT_PATH = './huazhuangpin.json'


def build_list_form(page):
    """Return the POST form fields requesting the given page of the list.

    Args:
        page: Page number; it is sent to the server as a string.

    Returns:
        A dict of form fields for the list endpoint.
    """
    return {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }


def fetch_ids(pages=range(1, 6)):
    """Collect the 'ID' of every entry on the given list pages.

    Args:
        pages: Iterable of page numbers to request (pages 1-5 by default).

    Returns:
        A list with the 'ID' value of each entry, in response order.

    Raises:
        requests.RequestException: On a network failure, timeout or HTTP
            error status.
    """
    collected = []
    for page in pages:
        response = requests.post(
            url=LIST_URL,
            data=build_list_form(page),
            headers=HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        collected.extend(entry['ID'] for entry in response.json()['list'])
    return collected


def fetch_details(ids):
    """Fetch the enterprise detail record for each ID.

    Args:
        ids: Iterable of ID values as returned by the list endpoint.

    Returns:
        A list of the decoded JSON detail responses, one per ID.

    Raises:
        requests.RequestException: On a network failure, timeout or HTTP
            error status.
    """
    details = []
    for record_id in ids:
        response = requests.post(
            url=DETAIL_URL,
            headers=HEADERS,
            data={'id': record_id},
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        detail_json = response.json()
        print(detail_json)
        details.append(detail_json)
    return details


def main():
    """Scrape list pages 1-5, fetch every detail record and save them as JSON."""
    id_list = fetch_ids()
    # Fetch the enterprise detail data.
    all_data = fetch_details(id_list)
    # Persist the results; the context manager guarantees the file is
    # flushed and closed even if serialisation fails.
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as fp:
        json.dump(all_data, fp=fp, ensure_ascii=False)
    print('success')


if __name__ == '__main__':
    main()

总结

requests模块

  • Python 中一款用于发起网络请求的模块

urllib模块

  • 古老，不推荐使用

如何使用

  • 指定url
  • UA 伪装（设置 User-Agent 请求头）
  • 处理请求参数
  • 发起请求
  • 获取响应数据
  • 持久化存储

数据解析

  • 聚焦爬虫
  • 正则
  • bs4
  • xpath
posted @ 2020-10-31 22:54  JWEY  阅读(78)  评论(0)    收藏  举报