import json
import os.path
import time
from jsonpath import *
# import jsonpath as jsonpath
import pandas as pd
import requests
# Search endpoint used by this script (set in the __main__ guard): http://www.whggzy.com/front/search/category
def get_resp(url, name, i, timeout=30):
    """POST one page of a category search and return the decoded JSON body.

    Args:
        url: Address of the search endpoint.
        name: Value sent as ``categoryCode``.
        i: Page number; sent as ``pageNo`` in string form.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Connection failure or timeout.
        ValueError: The response body is not valid JSON.
    """
    headers = {
        "Referer": "http://www.whggzy.com/PoliciesAndRegulations/index.html?utm=sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "*/*",
        "Content-Type": "application/json",
        "X-Requested-With": "XMLHttpRequest"
    }
    data = {
        "utm": "sites_group_front.26a79a93.0.0.715108e02e0e11ee837be5c5ca3fd993",
        "categoryCode": f"{name}",
        "pageSize": 15,
        # The page number is deliberately kept as a string, as before.
        "pageNo": f"{i}"
    }
    # Passing the payload via ``json=`` makes requests serialise it as a
    # JSON request body rather than form-encoding it.
    resp = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    resp.raise_for_status()
    return resp.json()
def save_json(content):
    """Serialise *content* to JSON and write it to ``wh_data.json``.

    The file is opened in ``'w'`` mode in the current working directory, so
    any previous contents are replaced.
    """
    # Serialise before opening the file: if the dump fails, the existing
    # file is left untouched.
    serialized = json.dumps(content)
    with open("wh_data.json", mode='w', encoding="utf-8") as out_file:
        out_file.write(serialized)
def get_data(data_list, csv_path, i):
    """Extract the CSV fields from each hit and append one row per hit.

    For every element of *data_list* the first match of each JSONPath
    expression (``pathName``, ``title``, ``publishDate``, ``attachmentUrl``,
    ``url``) is looked up; a field with no match becomes ``None``. The row
    is then appended to *csv_path* through ``save_csv``.

    Args:
        data_list: Iterable of hit objects from the search response.
        csv_path: Path of the CSV file the rows are appended to.
        i: Page number, used only in the progress message.
    """
    base_url = 'http://www.whggzy.com/'
    # Kept outside the loop so the progress message still works when
    # data_list is empty.
    pathName = ''
    for data in data_list:
        pathName = _first_match(data, '$..pathName')
        title = _first_match(data, '$..title')
        publishDate = _first_match(data, '$..publishDate')
        # publishDate is divided by 1000 before being handed to
        # time.localtime, i.e. it is treated as epoch milliseconds.
        # A missing value must not reach the division (TypeError on None).
        if publishDate is not None:
            date = time.strftime('%Y-%m-%d', time.localtime(publishDate / 1000))
        else:
            date = None
        attachmentUrl = _first_match(data, '$..attachmentUrl')
        relative_url = _first_match(data, '$..url')
        # Only build the absolute address when a relative one is present.
        url = base_url + relative_url if relative_url is not None else None
        csv_list = [pathName, title, date, attachmentUrl, url]
        save_csv(csv_list, csv_path)
    print(f'政策法规-->>{pathName}-->> 第{i}页下爬取完毕 !!!')


def _first_match(node, expr):
    """Return the first value *expr* matches in *node*, or None if no match."""
    # Evaluate the expression once; a falsy result is treated as "no match".
    matches = jsonpath(node, expr)
    return matches[0] if matches else None
def judge_csv_file():
    """Return the path of ``wh_data.csv`` beside this script, creating it if absent.

    The path is printed. When the file does not exist yet, it is created
    with a single header line; an existing file is left untouched.

    Returns:
        Absolute path of the CSV file.
    """
    # Directory that contains this script file.
    script_dir = os.path.abspath(os.path.dirname(os.path.abspath(__file__)))
    csv_path = os.path.join(script_dir, 'wh_data.csv')
    print(csv_path)
    if os.path.exists(csv_path):
        return csv_path
    # First run: write the column titles as the header row.
    column_titles = ['项目', '标题', '日期', '附件网址', '内容地址']
    header_line = ",".join(column_titles) + '\n'
    with open(csv_path, 'w', encoding="utf-8") as header_file:
        header_file.write(header_line)
    return csv_path
def save_csv(data_list, csv_path):
    """Append *data_list* to the CSV file at *csv_path* as a single row.

    Args:
        data_list: Sequence of cell values making up one row.
        csv_path: Path of the CSV file to append to.
    """
    # Wrapping the values in an outer list yields a one-row frame.
    row_frame = pd.DataFrame(data=[data_list])
    # mode='a' appends; no index column and no header line are written.
    row_frame.to_csv(
        csv_path,
        mode='a',
        header=False,
        index=False,
        encoding='utf-8',
    )
def run(url):
    """Crawl every category page by page and append the hits to the CSV file.

    For each category code, pages are requested starting at 1. Each response
    is passed to ``save_json`` and its ``['hits']['hits']`` list to
    ``get_data``. A category ends at the first page whose hit list is empty.

    Args:
        url: Address of the search endpoint, forwarded to ``get_resp``.
    """
    csv_path = judge_csv_file()
    category_codes = (
        "GovernmentProcurement",
        "BidAndEngineerConstruction",
        "LandAndMineralRightsTransaction",
        "TransactionOfPropertyRights",
        "TransactionOfPublicResources",
    )
    for category_code in category_codes:
        page_no = 1
        while True:
            page = get_resp(url, category_code, page_no)
            save_json(page)
            hits = page['hits']['hits']
            if not hits:
                # An empty page marks the end of this category.
                break
            get_data(hits, csv_path, page_no)
            page_no += 1
# Script entry point: start the crawl against the site's search endpoint.
if __name__ == '__main__':
    url = "http://www.whggzy.com/front/search/category"
    run(url=url)