北京市政百姓信件分析实战一 (利用python爬取数据)
因为我的python版本为3.12
所以安装一些软件包命令 与之前有些许不同
pip install beautifulsoup4
pip install demjson3
pip install requests
话不多说 代码奉上
import json
import demjson3
import requests
from bs4 import BeautifulSoup
import csv
# Browser-mimicking headers for the XHR POST to the beijing.gov.cn
# letters listing endpoint; reused for every page request below.
headers = {
'Host': 'www.beijing.gov.cn',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate',
'Content-Type': 'text/json',
'X-Requested-With': 'XMLHttpRequest',
# NOTE(review): requests computes Content-Length from the body itself; this
# hard-coded value may be stale/overridden — confirm it is actually needed.
'Content-Length': '155',
'Origin': 'http://www.beijing.gov.cn',
'Connection': 'keep-alive',
'Referer': 'http://www.beijing.gov.cn/hudong/hdjl/'
}
def _parse_detail(html, letter_id, letter_type):
    """Parse one letter detail page into a '|'-separated record line.

    Fields: id | type | title | sender | sent-time | question | office |
    answer-time | answer. Missing fields become empty strings.
    """
    soup = BeautifulSoup(html, "html.parser")

    def first_text(css_class):
        # Each field lives in a <div> identified by its bootstrap class list.
        # Query once and reuse the result (the original code ran the same
        # find_all twice per field, and in one guard passed a *set* instead
        # of a dict, so the guard tested a different query than the access).
        hits = soup.find_all("div", {"class": css_class})
        return hits[0].get_text() if hits else ""

    strong = soup.find("strong")
    title = strong.get_text().replace("\n", "") if strong else ""
    # str.removeprefix strips the exact label; the original lstrip('来信人:')
    # removed any of those characters, corrupting names starting with them.
    from_people = first_text("col-xs-10 col-lg-3 col-sm-3 col-md-4 text-muted").removeprefix("来信人:").strip()
    from_time = first_text("col-xs-5 col-lg-3 col-sm-3 col-md-3 text-muted").removeprefix("时间:")
    problem = first_text("col-xs-12 col-md-12 column p-2 text-muted mx-2").strip().replace("\r", "").replace("\n", "")
    office = first_text("col-xs-9 col-sm-7 col-md-5 o-font4 my-2").replace("\n", "")
    answer_time = first_text("col-xs-12 col-sm-3 col-md-3 my-2").removeprefix("答复时间:")
    answer = first_text("col-xs-12 col-md-12 column p-4 text-muted my-3").strip().replace("\n", "").replace("\r", "")
    return f"{letter_id}|{letter_type}|{title}|{from_people}|{from_time}|{problem}|{office}|{answer_time}|{answer}"


if __name__ == "__main__":
    payload = json.dumps({})  # endpoint expects an (empty) JSON body
    # Open the output file once instead of re-opening it per record.
    with open("yijian.txt", "a", encoding="utf-8") as fp:
        for page in range(1, 175):  # pages 1..174, as in the original loop
            print(page)
            list_url = (
                "https://www.beijing.gov.cn/hudong/hdjl/sindex/"
                f"bjah-index-hdjl!replyLetterListJson.action?page.pageNo={page}"
                "&page.pageSize=6&orgtitleLength=26"
            )
            # timeout added so a stalled connection cannot hang the crawl.
            resp = requests.post(list_url, data=payload, headers=headers, timeout=30)
            # The endpoint returns lenient (non-strict) JSON; demjson3 tolerates it.
            listing = demjson3.decode(resp.text)
            for item in listing.get("result", []):
                original_id = item.get("originalId")       # letter number
                letter_type = item.get("letterTypeName")   # letter type
                # Consultation letters and suggestion letters live under
                # different detail "flow" paths on the site.
                if letter_type == "咨询":
                    flow = "com.web.consult.consultDetail.flow"
                else:
                    flow = "com.web.suggest.suggesDetail.flow"
                detail_url = f"http://www.beijing.gov.cn/hudong/hdjl/{flow}?originalId={original_id}"
                detail = requests.get(detail_url, headers={"user-agent": "Mozilla/5.0"}, timeout=30)
                if detail.status_code != 200:
                    print(f"Failed to retrieve details for ID: {original_id}")
                    continue
                fp.write(_parse_detail(detail.text, original_id, letter_type) + "\n")

浙公网安备 33010602011771号