利用 HiveSql 语句离线分析信件内容数据以及利用 Sqoop 导出 Hive 分析数据到 MySQL 库;
首先我采用一个数据进行实验,过程也是从0开始包括对spark环境的配置以及sqoop的运用:
时效性分析(部门平均处理时长)
CREATE TABLE analysis_duration AS SELECT org, AVG(duration_days) AS avg_days, PERCENTILE_APPROX(duration_days, 0.5) AS median_days FROM cleaned_letters WHERE duration_days >= 0 GROUP BY org ORDER BY avg_days DESC;
高频关键词分析
CREATE TABLE analysis_keywords AS SELECT keyword, COUNT(*) AS frequency FROM ( SELECT EXPLODE(SPLIT(content, ' ')) AS keyword FROM cleaned_letters WHERE content IS NOT NULL ) t WHERE LENGTH(keyword) >= 2 GROUP BY keyword ORDER BY frequency DESC LIMIT 100;
部门处理量统计
CREATE TABLE analysis_org_count AS
SELECT
org,
COUNT(*) AS total_letters
FROM cleaned_letters
GROUP BY org
ORDER BY total_letters DESC;
sqoop export \
--connect jdbc:mysql://103.8.71.176:64525/gavernment \
--username root \
--password Q1234509876dwt \
--table analysis_duration \
--export-dir /user/hive/warehouse/analysis_duration \
--input-fields-terminated-by '\001' \
--input-lines-terminated-by '\n' \
--columns "org,avg_days,median_days"


浙公网安备 33010602011771号