北京信箱实践
- 采集北京市政百姓信件内容;
- 编写MapReduce程序清洗信件内容数据;
- 利用HiveSql语句离线分析信件内容数据;
- 利用Sqoop导出Hive分析数据到MySQL库;
- 开发JavaWeb+ECharts 完成信件数据图表展示过程。
数据清洗以及分析信件内容
今天继续完成接下来的部分
爬取的数据

目前这数据抖挺好的,我觉得不用进行清洗,直接导入
首先创建表
CREATE TABLE letters (
originalId STRING,
letterTypeName STRING,
letterTitle STRING,
writeDate STRING,
replyDate STRING,
href STRING,
come_content STRING,
replay_content STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
通过finalshell将文件上传到虚拟机,之后进行加载数据
LOAD DATA LOCAL INPATH '/export/data/letters.csv' OVERWRITE INTO TABLE letters;
-- 删除标题行
INSERT OVERWRITE TABLE letters
SELECT * FROM letters
WHERE originalId != 'originalId';

删除标题行后

数据处理
-- 创建新表存储清洗后的数据
CREATE TABLE letters_cleaned (
originalId STRING,
letterTypeName STRING,
letterTitle STRING,
writeDate DATE, -- 转换为DATE类型
replyDate DATE, -- 转换为DATE类型
href STRING,
come_content STRING,
replay_content STRING
)
STORED AS TEXTFILE;
-- 插入清洗后的数据
INSERT OVERWRITE TABLE letters_cleaned
SELECT
trim(originalId), -- 去除空格
trim(letterTypeName),
trim(letterTitle),
-- 转换日期格式
from_unixtime(unix_timestamp(writeDate, 'yyyy-MM-dd'), 'yyyy-MM-dd'),
from_unixtime(unix_timestamp(replyDate, 'yyyy-MM-dd'), 'yyyy-MM-dd'),
-- 处理URL,去除可能的空格
trim(href),
-- 清理内容中的特殊字符
regexp_replace(trim(come_content), '[\\r\\n\\t]', ' '),
regexp_replace(trim(replay_content), '[\\r\\n\\t]', ' ')
FROM letters
WHERE
-- 过滤无效数据
originalId IS NOT NULL
AND originalId != ''
AND writeDate IS NOT NULL
AND writeDate != '';
-- 3.
-- 创建月度来信量统计表
CREATE TABLE monthly_letters_count AS
SELECT
substr(writeDate, 1, 7) as month,
COUNT(*) as letter_count,
COUNT(DISTINCT originalId) as unique_letters,
ROUND(AVG(datediff(replyDate, writeDate)), 2) as avg_reply_days
FROM letters
GROUP BY substr(writeDate, 1, 7)
ORDER BY month;
-- 查看结果
SELECT * FROM monthly_letters_count;
-- 创建来信类型统计表
CREATE TABLE letter_type_stats AS
SELECT
letterTypeName as type,
COUNT(*) as total_count,
ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM letters_cleaned), 2) as percentage,
ROUND(AVG(datediff(replyDate, writeDate)), 2) as avg_reply_days
FROM letters_cleaned
GROUP BY letterTypeName
ORDER BY total_count DESC;
-- 查看结果
SELECT * FROM letter_type_stats;
浙公网安备 33010602011771号