北京信箱实践

  1. 采集北京市政百姓信件内容;
  2. 编写MapReduce程序清洗信件内容数据;
  3. 利用HiveSql语句离线分析信件内容数据;
  4. 利用Sqoop导出Hive分析数据到MySQL库;
  5. 开发JavaWeb+ECharts 完成信件数据图表展示过程。

数据导出到MYSQL

首先本地MYSQL创建对应的表

-- 创建月度统计表
CREATE TABLE monthly_letters_count (
    month VARCHAR(7),
    letter_count INT,
    unique_letters INT,
    avg_reply_days DECIMAL(10,2),
    PRIMARY KEY (month)
);

-- 创建来信类型统计表
CREATE TABLE letter_type_stats (
    type VARCHAR(50),
    total_count INT,
    percentage DECIMAL(5,2),
    avg_reply_days DECIMAL(10,2),
    PRIMARY KEY (type)
);


-- 创建清洗后的信件表
CREATE TABLE letters_cleaned (
    originalId VARCHAR(50),
    letterTypeName VARCHAR(50),
    letterTitle VARCHAR(255),
    writeDate DATE,
    replyDate DATE,
    href TEXT,
    come_content TEXT,
    replay_content TEXT,
    PRIMARY KEY (originalId)
);

接着依次导出


# 导出月度统计表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table monthly_letters_count \
--export-dir /user/hive/warehouse/first.db/monthly_letters_count \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'

# 导出来信类型统计表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table letter_type_stats \
--export-dir /user/hive/warehouse/first.db/letter_type_stats \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'

# 导出清洗后的信件表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table letters_cleaned \
--export-dir /user/hive/warehouse/first.db/letters_cleaned \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'

posted on 2025-01-26 17:40  许七安gyg  阅读(4)  评论(0)    收藏  举报