北京信箱实践

  1. 采集北京市政百姓信件内容;
  2. 编写MapReduce程序清洗信件内容数据;
  3. 利用HiveSql语句离线分析信件内容数据;
  4. 利用Sqoop导出Hive分析数据到MySQL库;
  5. 开发JavaWeb+ECharts 完成信件数据图表展示过程。

数据清洗以及分析信件内容

今天继续完成接下来的部分
爬取的数据
img
目前这数据抖挺好的,我觉得不用进行清洗,直接导入

首先创建表

CREATE TABLE letters (
    originalId STRING,
    letterTypeName STRING,
    letterTitle STRING,
    writeDate STRING,
    replyDate STRING,
    href STRING,
    come_content STRING,
    replay_content STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;

通过finalshell将文件上传到虚拟机,之后进行加载数据

LOAD DATA LOCAL INPATH '/export/data/letters.csv' OVERWRITE INTO TABLE letters;

-- 删除标题行
INSERT OVERWRITE TABLE letters
SELECT * FROM letters
WHERE originalId != 'originalId';

img
删除标题行后
img

数据处理

-- 创建新表存储清洗后的数据
CREATE TABLE letters_cleaned (
    originalId STRING,
    letterTypeName STRING,
    letterTitle STRING,
    writeDate DATE,    -- 转换为DATE类型
    replyDate DATE,    -- 转换为DATE类型
    href STRING,
    come_content STRING,
    replay_content STRING
)
STORED AS TEXTFILE;

-- 插入清洗后的数据
INSERT OVERWRITE TABLE letters_cleaned
SELECT
    trim(originalId),  -- 去除空格
    trim(letterTypeName),
    trim(letterTitle),
    -- 转换日期格式
    from_unixtime(unix_timestamp(writeDate, 'yyyy-MM-dd'), 'yyyy-MM-dd'),
    from_unixtime(unix_timestamp(replyDate, 'yyyy-MM-dd'), 'yyyy-MM-dd'),
    -- 处理URL,去除可能的空格
    trim(href),
    -- 清理内容中的特殊字符
    regexp_replace(trim(come_content), '[\\r\\n\\t]', ' '),
    regexp_replace(trim(replay_content), '[\\r\\n\\t]', ' ')
FROM letters
WHERE
    -- 过滤无效数据
    originalId IS NOT NULL
    AND originalId != ''
    AND writeDate IS NOT NULL
    AND writeDate != '';

-- 3.
-- 创建月度来信量统计表
CREATE TABLE monthly_letters_count AS
SELECT
    substr(writeDate, 1, 7) as month,
    COUNT(*) as letter_count,
    COUNT(DISTINCT originalId) as unique_letters,
    ROUND(AVG(datediff(replyDate, writeDate)), 2) as avg_reply_days
FROM letters
GROUP BY substr(writeDate, 1, 7)
ORDER BY month;

-- 查看结果
SELECT * FROM monthly_letters_count;

-- 创建来信类型统计表
CREATE TABLE letter_type_stats AS
SELECT
    letterTypeName as type,
    COUNT(*) as total_count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM letters_cleaned), 2) as percentage,
    ROUND(AVG(datediff(replyDate, writeDate)), 2) as avg_reply_days
FROM letters_cleaned
GROUP BY letterTypeName
ORDER BY total_count DESC;

-- 查看结果
SELECT * FROM letter_type_stats;
posted on 2025-01-25 15:44  许七安gyg  阅读(5)  评论(0)    收藏  举报