Crawling Letter Details with Python

The task requirements are as follows:
(1) Collect the letter content from the Beijing municipal government's public-letters site;
(2) Write a MapReduce program to clean the letter content data;
(3) Analyze the letter content data offline with HiveSQL;
(4) Export the Hive analysis results to a MySQL database with Sqoop;
(5) Build a JavaWeb + ECharts application to chart the letter data.
So far I have implemented the data crawling in Python and I am stuck at step (4), because I don't yet know what Sqoop is. I'll cover the remaining steps once I finish them. My current source code is below:
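On step (4): Sqoop is an Apache tool that bulk-transfers data between Hadoop storage (including the HDFS files behind Hive tables) and relational databases such as MySQL, so the export in (4) usually comes down to a single `sqoop export` command run from the shell once the Hive results exist. Below is only a minimal sketch, wrapped in Python's subprocess module so the example stays in the same language as the rest of this post; the JDBC URL, credentials, table names, HDFS path and field delimiter are all placeholder assumptions that would need to match the real Hive table, not values from this project.

import subprocess

# Minimal sketch of invoking `sqoop export` from Python.
# Every connection detail below is a placeholder (assumption), not a value from this project.
sqoop_cmd = [
    "sqoop", "export",
    "--connect", "jdbc:mysql://localhost:3306/letters_db",  # hypothetical MySQL database
    "--username", "root",
    "--password", "your_password",
    "--table", "letter_stats",                              # hypothetical MySQL target table
    "--export-dir", "/user/hive/warehouse/letter_stats",    # HDFS directory backing the Hive table
    "--input-fields-terminated-by", "\\t",                  # must match the Hive table's delimiter
    "-m", "1",                                              # a single map task is enough for a small export
]

result = subprocess.run(sqoop_cmd, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)

In practice you would more likely type the same command directly in a terminal on the Hadoop node; the Python wrapper above is only to keep the example consistent with the rest of the post.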

batch_letter_crawler.py

import requests
from bs4 import BeautifulSoup
import re
import time
import csv

def get_page_content(url, page_no):
    """Fetch the HTML of the given page number of the letter list."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Form data for the paginated list request
    data = {
        'page.pageNo': page_no,
        'page.pageSize': '6'
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取第{page_no}页内容失败: {e}")
        return None

def parse_letter_list(html_content):
    """Parse the letter list page and extract each letter's basic info plus its originalId."""
    letters = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find every letter entry on the list page
    letter_items = soup.select('div.row.clearfix.my-2.list-group.o-border-bottom2.p-3')

    for item in letter_items:
        try:
            # Letter title and originalId
            letter_name_div = item.select_one('div.o-font3.col-md-7.pb-3 a')
            if letter_name_div:
                letter_name = letter_name_div.text.strip()
                # The originalId is embedded in the onclick handler
                onclick_attr = letter_name_div.get('onclick', '')
                match = re.search(r"letterdetail\('\d+'\s*,\s*'([^']+)'\)", onclick_attr)
                if match:
                    original_id = match.group(1)
                else:
                    original_id = ""
            else:
                letter_name = "未找到名称"
                original_id = ""

            # Submission time and reply time
            time_divs = item.select('div.col-md-5 div.o-font2')
            来信_time = ""
            回复_time = ""

            for time_div in time_divs:
                text = time_div.text.strip()
                if '来信时间:' in text:
                    来信_time = text.replace('来信时间:', '').strip()
                elif '回复时间:' in text:
                    回复_time = text.replace('回复时间:', '').strip()

            # Collect the entry
            letters.append({
                '名称': letter_name,
                '来信时间': 来信_time,
                '回复时间': 回复_time,
                'originalId': original_id
            })
        except Exception as e:
            print(f"解析信件列表失败: {e}")
            continue

    return letters

def get_letter_detail(original_id):
    """Fetch the detail page of a single letter by its originalId."""
    if not original_id:
        return None

    url = f'https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId={original_id}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取信件 {original_id} 详情失败: {e}")
        return None

def parse_letter_detail(html_content):
    """Parse the letter detail page and extract the letter content and reply information."""
    soup = BeautifulSoup(html_content, 'html.parser')
    letter_detail = {}

    try:
        # Letter body
        content_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
        if content_div:
            letter_detail['信件内容'] = content_div.text.strip()
        else:
            letter_detail['信件内容'] = "未找到信件内容"

        # Reply block (the second bordered row on the page)
        reply_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 + div.row.clearfix.my-5.o-border.p-2')
        if reply_div:
            # Replying department
            reply_unit_div = reply_div.select_one('div.col-xs-9.col-sm-7.col-md-5.o-font4.my-2 strong')
            if reply_unit_div:
                letter_detail['回复单位'] = reply_unit_div.text.strip()
            else:
                letter_detail['回复单位'] = "未找到回复单位"

            # Reply time
            reply_time_div = reply_div.select_one('div.col-xs-12.col-sm-12.col-md-12.nmx-2.my-2.text-muted')
            if reply_time_div:
                letter_detail['回复时间'] = reply_time_div.text.replace('答复时间:', '').strip()
            else:
                letter_detail['回复时间'] = "未找到回复时间"

            # Reply body
            reply_content_div = reply_div.select_one('div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
            if reply_content_div:
                letter_detail['回复内容'] = reply_content_div.text.strip()
            else:
                letter_detail['回复内容'] = "未找到回复内容"

    except Exception as e:
        print(f"解析信件详情失败: {e}")

    return letter_detail

def output_letters(all_letters, output_file='all_letters_detail.csv'):
    """Write every letter's details to a CSV file."""
    # CSV column names
    fieldnames = ['序号', '名称', '来信时间', '回复时间', 'originalId', '信件内容', '回复单位', '回复内容']

    def clean_text(text):
        """Collapse extra spaces, newlines and tabs into single spaces."""
        if text:
            return ' '.join(text.split())
        return ''

    with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Header row
        writer.writeheader()

        # Data rows
        for i, letter in enumerate(all_letters, 1):
            writer.writerow({
                '序号': i,
                '名称': clean_text(letter.get('名称', '')),
                '来信时间': clean_text(letter.get('来信时间', '')),
                '回复时间': clean_text(letter.get('回复时间', '')),
                'originalId': clean_text(letter.get('originalId', '')),
                '信件内容': clean_text(letter.get('信件内容', '')),
                '回复单位': clean_text(letter.get('回复单位', '')),
                '回复内容': clean_text(letter.get('回复内容', ''))
            })

    print(f"所有信件详细信息已输出到 {output_file}")
    print(f"共输出 {len(all_letters)} 条记录")

def main():
    """Entry point: crawl the list pages, then each letter's detail page, then write the CSV."""
    list_url = 'https://www.beijing.gov.cn/hudong/hdjl/sindex/hdjl-xjxd.html'
    all_letters = []
    total_pages = 44  # the site reports 44 pages in total

    print("开始爬取信件列表...")

    # Step 1: crawl all list pages for basic info and originalIds
    for page_no in range(1, total_pages + 1):
        print(f"正在爬取第 {page_no}/{total_pages} 页列表...")
        html_content = get_page_content(list_url, page_no)
        if html_content:
            letters = parse_letter_list(html_content)
            all_letters.extend(letters)

    print(f"爬取完成,共获取到 {len(all_letters)} 封信件列表信息")

    # Step 2: crawl each letter's detail page
    print("\n开始爬取信件详细信息...")
    for i, letter in enumerate(all_letters, 1):
        original_id = letter.get('originalId', '')
        if original_id:
            print(f"正在爬取第 {i}/{len(all_letters)} 封信件的详细信息 (ID: {original_id})...")
            detail_html = get_letter_detail(original_id)
            if detail_html:
                detail_info = parse_letter_detail(detail_html)
                # Merge the detail fields into the existing record
                letter.update(detail_info)
            # Throttle requests so we don't hit the server too fast
            time.sleep(1)
        else:
            print(f"第 {i}/{len(all_letters)} 封信件没有originalId,跳过...")

    # Step 3: write the results
    output_letters(all_letters)
    print("\n所有信件信息爬取完成!")


if __name__ == "__main__":
    main()

letter_crawler.py

import requests
from bs4 import BeautifulSoup
import re

def get_page_content(url, page_no):
    """Fetch the HTML of the given page number of the letter list."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Form data for the paginated list request
    data = {
        'page.pageNo': page_no,
        'page.pageSize': '6'
    }
    try:
        response = requests.post(url, headers=headers, data=data)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取第{page_no}页内容失败: {e}")
        return None

def parse_letters(html_content):
    """Parse the list page HTML and extract basic letter information."""
    letters = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find every letter entry on the list page
    letter_items = soup.select('div.row.clearfix.my-2.list-group.o-border-bottom2.p-3')

    for item in letter_items:
        try:
            # Letter title and originalId
            letter_name_div = item.select_one('div.o-font3.col-md-7.pb-3 a')
            if letter_name_div:
                letter_name = letter_name_div.text.strip()
                # The originalId is embedded in the onclick handler
                # (re is already imported at the top of the file)
                onclick_attr = letter_name_div.get('onclick', '')
                match = re.search(r"letterdetail\('\d+'\s*,\s*'([^']+)'\)", onclick_attr)
                if match:
                    original_id = match.group(1)
                else:
                    original_id = ""
            else:
                letter_name = "未找到名称"
                original_id = ""

            # Submission time and reply time
            time_divs = item.select('div.col-md-5 div.o-font2')
            来信_time = ""
            回复_time = ""

            for time_div in time_divs:
                text = time_div.text.strip()
                if '来信时间:' in text:
                    来信_time = text.replace('来信时间:', '').strip()
                elif '回复时间:' in text:
                    回复_time = text.replace('回复时间:', '').strip()

            # Collect the entry
            letters.append({
                '名称': letter_name,
                '来信时间': 来信_time,
                '回复时间': 回复_time,
                'originalId': original_id
            })
        except Exception as e:
            print(f"解析信件失败: {e}")
            continue

    return letters

def output_letters(letters, output_file='letters_info.txt'):
    """Write the letter information to a text file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        for i, letter in enumerate(letters, 1):
            f.write(f"信件 {i}:\n")
            f.write(f"  名称: {letter['名称']}\n")
            f.write(f"  来信时间: {letter['来信时间']}\n")
            f.write(f"  回复时间: {letter['回复时间']}\n")
            f.write("-" * 50 + "\n")
    print(f"信件信息已输出到 {output_file}")

def main():
    """Entry point: crawl all list pages and write the results to a text file."""
    url = 'https://www.beijing.gov.cn/hudong/hdjl/sindex/hdjl-xjxd.html'
    all_letters = []
    total_pages = 44  # page count taken from the site's HTML

    print("开始爬取信件信息...")

    for page_no in range(1, total_pages + 1):
        print(f"正在爬取第 {page_no}/{total_pages} 页...")
        html_content = get_page_content(url, page_no)
        if html_content:
            letters = parse_letters(html_content)
            all_letters.extend(letters)

    print(f"爬取完成,共获取到 {len(all_letters)} 封信件信息")
    output_letters(all_letters)


if __name__ == "__main__":
    main()

letter_detail_crawler.py

import requests
from bs4 import BeautifulSoup

def get_letter_detail(url):
    """Fetch the HTML of a letter detail page."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取信件详情失败: {e}")
        return None

def parse_letter_detail(html_content):
    """Parse a letter detail page: title, sender, times, letter content and reply."""
    soup = BeautifulSoup(html_content, 'html.parser')
    letter_detail = {}

    try:
        # Letter title
        title_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div strong')
        if title_div:
            letter_detail['标题'] = title_div.text.strip()
        else:
            letter_detail['标题'] = "未找到标题"

        # Sender information
        letter_info_div = soup.select_one('div.col-xs-12.col-md-12.column.my-3')
        if letter_info_div:
            # Sender
            sender_div = letter_info_div.select_one('div.col-xs-10.col-lg-3.col-sm-3.col-md-4.text-muted')
            if sender_div:
                letter_detail['来信人'] = sender_div.text.replace('来信人:', '').strip()
            else:
                letter_detail['来信人'] = "未找到来信人"

            # Submission time
            time_div = letter_info_div.select_one('div.col-xs-5.col-lg-3.col-sm-3.col-md-3.text-muted')
            if time_div:
                letter_detail['来信时间'] = time_div.text.replace('时间:', '').strip()
            else:
                letter_detail['来信时间'] = "未找到来信时间"

        # Letter body
        content_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
        if content_div:
            letter_detail['信件内容'] = content_div.text.strip()
        else:
            letter_detail['信件内容'] = "未找到信件内容"

        # Reply block (the second bordered row on the page)
        reply_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 + div.row.clearfix.my-5.o-border.p-2')
        if reply_div:
            # Replying department
            reply_unit_div = reply_div.select_one('div.col-xs-9.col-sm-7.col-md-5.o-font4.my-2 strong')
            if reply_unit_div:
                letter_detail['回复单位'] = reply_unit_div.text.strip()
            else:
                letter_detail['回复单位'] = "未找到回复单位"

            # Reply time
            reply_time_div = reply_div.select_one('div.col-xs-12.col-sm-12.col-md-12.nmx-2.my-2.text-muted')
            if reply_time_div:
                letter_detail['回复时间'] = reply_time_div.text.replace('答复时间:', '').strip()
            else:
                letter_detail['回复时间'] = "未找到回复时间"

            # Reply body
            reply_content_div = reply_div.select_one('div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
            if reply_content_div:
                letter_detail['回复内容'] = reply_content_div.text.strip()
            else:
                letter_detail['回复内容'] = "未找到回复内容"

    except Exception as e:
        print(f"解析信件详情失败: {e}")

    return letter_detail

def output_letter_detail(letter_detail, output_file='letter_detail.txt'):
    """Write the letter detail to a text file."""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("信件详情信息\n")
        f.write("=" * 50 + "\n")
        for key, value in letter_detail.items():
            f.write(f"{key}: {value}\n")
    print(f"信件详情已输出到 {output_file}")

def main():
    """Entry point: crawl a single letter detail page, print the result and save it."""
    url = 'https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=AH25122401333'

    print("开始爬取信件详情...")

    # Fetch the page
    html_content = get_letter_detail(url)
    if html_content:
        # Parse the detail page
        letter_detail = parse_letter_detail(html_content)

        # Output the result
        if letter_detail:
            print("爬取成功,信件详情:")
            for key, value in letter_detail.items():
                print(f"{key}: {value}")
            output_letter_detail(letter_detail)
        else:
            print("未找到信件详情")
    else:
        print("获取页面失败")


if __name__ == "__main__":
    main()
