Crawling citizen letters from the Beijing municipal government site with requests and BeautifulSoup
The core of the crawler is a loop over the listing pages: each page is fetched with a GET request, parsed with BeautifulSoup, and every <a> tag carrying an onclick handler is collected. In the onclick arguments, the first is the letter type and the second is the letter ID.

for page in range(start_page, end_page + 1):
    # Build this page's URL without overwriting the template;
    # the url template is expected to contain a '{}' placeholder for the page number
    page_url = url.format(page)
    # Send a GET request to fetch the page
    response = requests.get(page_url, headers=headers)
    contents = response.text
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(contents, "html.parser")
    # In each onclick, the first argument is the letter type, the second the letter ID
    a_tags = soup.find_all('a', onclick=True)
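Each matched onclick is expected to look like a call to letterdetail() with two quoted arguments. Here is a quick standalone check of the regex the script relies on, run against a made-up onclick value (the letter ID below is invented for illustration):

import re

# Hypothetical onclick value in the shape the crawler expects; the ID is made up.
onclick_value = "letterdetail('1', 'AH23061000001');"
match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)", onclick_value)
if match:
    print(match.group(1))  # '1' -> letter type code
    print(match.group(2))  # 'AH23061000001' -> letter ID

Note the single space after the comma in the pattern: if the site ever emits letterdetail('1','AH...') without that space, this regex silently skips the tag, so a more tolerant pattern such as r"letterdetail\('(\d+)',\s*'([^']+)'\)" may be safer.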
The complete script follows (the scraped letter IDs are written to different txt files according to letter type):
import requests
from bs4 import BeautifulSoup
import re


def huoqu():
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    # Send a GET request to fetch the page
    response = requests.get(url, headers=headers)
    contents = response.text
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(contents, "html.parser")
    return soup


def huoqu1(start_page, end_page):
    # The URL template is expected to contain a '{}' placeholder for the page number
    url = "https://www.beijing.gov.cn/hudong/hdjl/com.web.search.mailList.flow"  # replace with the target site's URL
    cookie = "__jsluid_s=7e6494284621930c061e56e28c73fe04; arialoadData=false; __jsluid_h=babf6155559102d42f5b7f0b024bab8e;" \
             "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22188a626b5289cc-04b250d08e6751-7e56547b-1638720-188a626b529108d%22%7D;" \
             " sensorsdata_is_new_user=true; bjah7webroute=83fabc8af7a68a44338f4ee9b2831e7d; BJAH7WEB1VSSTIMEID=4065C3D9D249C359ABB3E1EBF7BD9553; " \
             "JSESSIONID=MDkwMjUwODgtM2E5YS00N2QzLWExYWItMmE2OWJjZTM1ZmI0; _va_ref=%5B%22%22%2C%22%22%2C1686446660%2C%22https%3A%2F%2Fcn.bing.com%2F%22%5D;" \
             " _va_ses=*; route=c5730edea4c5f2b5d7a6534850353a0c; JSESSIONID=56EE4BE6A09AA5BE642BA33CE292B0D3; " \
             "_va_id=d80e32c2da04fb2f.1686412321.2.1686447410.1686446660."
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" \
                 " Chrome/114.0.0.0 Safari/537.36 Edg/114.0.1823.43"
    headers = {"User-Agent": user_agent, "Cookie": cookie}
    # One output file per letter type
    f1 = open('G:/python/pythonProject/信件爬取/1.txt', 'a')
    f2 = open('G:/python/pythonProject/信件爬取/2.txt', 'a')
    f3 = open('G:/python/pythonProject/信件爬取/3.txt', 'a')
    for page in range(start_page, end_page + 1):
        page_url = url.format(page)  # build this page's URL without overwriting the template
        # Send a GET request to fetch the page
        response = requests.get(page_url, headers=headers)
        contents = response.text
        # Parse the page with BeautifulSoup
        soup = BeautifulSoup(contents, "html.parser")
        a_tags = soup.find_all('a', onclick=True)
        for element in a_tags:
            # In each onclick, the first argument is the letter type, the second the letter ID
            onclick_value = element["onclick"]
            match = re.search(r"letterdetail\('(\d+)', '([^']+)'\)", onclick_value)
            if match:
                onclick_param1 = match.group(1)
                onclick_param2 = match.group(2)
                if onclick_param1 == '1':
                    f1.write(onclick_param2 + '\n')
                elif onclick_param1 == '2':
                    f2.write(onclick_param2 + '\n')
                elif onclick_param1 == '3':
                    f3.write(onclick_param2 + '\n')
                print(f"onclick param 1: {onclick_param1}, onclick param 2: {onclick_param2}")
        # Flush after each page so partial results survive an interruption
        f1.flush()
        f2.flush()
        f3.flush()
    f1.close()
    f2.close()
    f3.close()


if __name__ == '__main__':
    huoqu1(1, 173)
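As written, huoqu1 fires all 173 requests back to back with no pause and no error handling, so a single transient failure aborts the whole run. Below is a minimal sketch of a more defensive fetch helper; the fetch_page name, the retry count, and the pause length are assumptions of mine, not part of the original script.

import time
import requests

def fetch_page(url_template, page, headers, retries=3, delay=1.0):
    # Hypothetical helper: fetch one listing page, retrying on transient errors.
    # Assumes url_template contains a '{}' placeholder for the page number.
    page_url = url_template.format(page)
    for attempt in range(retries):
        try:
            response = requests.get(page_url, headers=headers, timeout=30)
            response.raise_for_status()  # treat HTTP error codes as failures
            return response.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise  # give up after the final attempt
            time.sleep(delay)  # brief pause before retrying

Inside the page loop, contents = fetch_page(url, page, headers) would replace the direct requests.get call, and an unconditional time.sleep between pages keeps the crawl polite to the server.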

