北京信箱实践
- 采集北京市政百姓信件内容;
- 编写MapReduce程序清洗信件内容数据;
- 利用HiveSql语句离线分析信件内容数据;
- 利用Sqoop导出Hive分析数据到MySQL库;
- 开发JavaWeb+ECharts 完成信件数据图表展示过程。
数据导出到MYSQL
首先本地MYSQL创建对应的表
-- 创建月度统计表
CREATE TABLE monthly_letters_count (
month VARCHAR(7),
letter_count INT,
unique_letters INT,
avg_reply_days DECIMAL(10,2),
PRIMARY KEY (month)
);
-- 创建来信类型统计表
CREATE TABLE letter_type_stats (
type VARCHAR(50),
total_count INT,
percentage DECIMAL(5,2),
avg_reply_days DECIMAL(10,2),
PRIMARY KEY (type)
);
-- 创建清洗后的信件表
CREATE TABLE letters_cleaned (
originalId VARCHAR(50),
letterTypeName VARCHAR(50),
letterTitle VARCHAR(255),
writeDate DATE,
replyDate DATE,
href TEXT,
come_content TEXT,
replay_content TEXT,
PRIMARY KEY (originalId)
);
接着依次导出
# 导出月度统计表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table monthly_letters_count \
--export-dir /user/hive/warehouse/first.db/monthly_letters_count \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
# 导出来信类型统计表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table letter_type_stats \
--export-dir /user/hive/warehouse/first.db/letter_type_stats \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
# 导出清洗后的信件表
sqoop export \
--connect "jdbc:mysql://192.168.0.106:3306/beijing_letters?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC" \
--username root \
--password 123456 \
--table letters_cleaned \
--export-dir /user/hive/warehouse/first.db/letters_cleaned \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n'
浙公网安备 33010602011771号