Hive Data Cleansing in Practice: The Complete Pipeline from MySQL to the Data Lake
Environment Setup and Tool Selection
Technology Stack
# Core components
- Hadoop 3.x
- Hive 3.x
- Sqoop 1.4.7
- MySQL 8.0
# Optional tooling
- Airflow (scheduling)
- Hue (visualization)
The Complete Data Cleansing Pipeline
Step 1: Data Extraction (Sqoop Import from MySQL)
-- 1. Create a Hive external table mapped onto MySQL via the JDBC storage handler
--    (Hive 3.x ships org.apache.hive.storage.jdbc.JdbcStorageHandler; the
--    "MySQLStorageHandler" class sometimes seen in older posts does not exist)
CREATE EXTERNAL TABLE mysql_sales_raw (
    id BIGINT,
    order_no STRING,
    customer_id BIGINT,
    amount DECIMAL(10,2),
    order_date TIMESTAMP,
    status INT,
    last_update TIMESTAMP
)
COMMENT 'Raw sales order table (MySQL mapping)'
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
    "hive.sql.database.type" = "MYSQL",
    "hive.sql.jdbc.driver" = "com.mysql.cj.jdbc.Driver",
    "hive.sql.jdbc.url" = "jdbc:mysql://192.168.1.100:3306/biz_db",
    "hive.sql.dbcp.username" = "etl_user",
    "hive.sql.dbcp.password" = "******",
    "hive.sql.table" = "sales_order"
);
# Alternatively, use Sqoop incremental import (recommended).
# Note: Sqoop does not allow --hive-import together with
# --incremental lastmodified, so import into an HDFS directory
# (example path below) and merge updated rows on the primary key.
sqoop import \
  --connect jdbc:mysql://192.168.1.100:3306/biz_db \
  --username etl_user \
  --password ****** \
  --table sales_order \
  --target-dir /warehouse/ods/sales_order_raw \
  --incremental lastmodified \
  --check-column last_update \
  --last-value "2024-01-01 00:00:00" \
  --merge-key id
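After the merge completes, Hive can read the imported files through an external table pointed at that directory. A minimal sketch, assuming the example target directory above and Sqoop's default comma-delimited text output (both are assumptions, adjust to your layout):

-- External table over the Sqoop import directory (hypothetical path)
CREATE EXTERNAL TABLE IF NOT EXISTS sales_order_raw (
    id BIGINT,
    order_no STRING,
    customer_id BIGINT,
    amount DECIMAL(10,2),
    order_date TIMESTAMP,
    status INT,
    last_update TIMESTAMP
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/warehouse/ods/sales_order_raw';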
Step 2: Data Quality Checks and Problem Diagnosis
-- 1. Data profiling summary
CREATE TABLE data_quality_report AS
SELECT
'sales_order' as table_name,
COUNT(*) as total_rows,
COUNT(DISTINCT id) as distinct_ids,
SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) as null_ids,
SUM(CASE WHEN order_no IS NULL OR order_no = '' THEN 1 ELSE 0 END) as null_order_nos,
MIN(order_date) as min_date,
MAX(order_date) as max_date,
AVG(amount) as avg_amount,
SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) as negative_amounts
FROM mysql_sales_raw;
-- 2. Duplicate detection
SELECT
order_no,
COUNT(*) as duplicate_count,
COLLECT_LIST(id) as duplicate_ids
FROM mysql_sales_raw
GROUP BY order_no
HAVING COUNT(*) > 1;
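Once duplicates are confirmed, a common cleansing pattern is to keep only the most recent record per order_no. A sketch using ROW_NUMBER(), assuming last_update reliably reflects recency and a staging table name of your choosing:

-- Keep the latest row per order_no (sales_order_dedup is a hypothetical name)
CREATE TABLE sales_order_dedup AS
SELECT id, order_no, customer_id, amount, order_date, status, last_update
FROM (
    SELECT id, order_no, customer_id, amount, order_date, status, last_update,
           ROW_NUMBER() OVER (PARTITION BY order_no
                              ORDER BY last_update DESC) AS rn
    FROM mysql_sales_raw
) t
WHERE rn = 1;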
Step 3: Core Cleansing and Transformation Logic
-- 1. Create the cleansed staging table
CREATE TABLE sales_order_cleaned (
order_id BIGINT,
order_no STRING,
customer_id BIGINT,
amount DECIMAL(10,2),
order_date DATE,
order_year INT,
order_month INT,
order_week INT,
status STRING,
amount_category STRING,
is_valid BOOLEAN,
invalid_reason STRING,
etl_time TIMESTAMP
)
PARTITIONED BY (dt STRING)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");
-- 2. Run the cleansing transformation
INSERT OVERWRITE TABLE sales_order_cleaned PARTITION(dt='${hiveconf:batch_date}')
SELECT
    id as order_id,
    -- Trim whitespace and normalize the order number
    UPPER(TRIM(order_no)) as order_no,
    -- Null out invalid customer IDs
    CASE
        WHEN customer_id <= 0 THEN NULL
        ELSE customer_id
    END as customer_id,
    -- Clamp the amount into a sane range
    CASE
        WHEN amount > 1000000 THEN 1000000
        WHEN amount < 0 THEN 0
        ELSE ROUND(amount, 2)
    END as amount,
    -- Standardize the date format
    TO_DATE(order_date) as order_date,
    YEAR(order_date) as order_year,
    MONTH(order_date) as order_month,
    WEEKOFYEAR(order_date) as order_week,
    -- Map status codes to labels
    CASE status
        WHEN 1 THEN 'PENDING'
        WHEN 2 THEN 'PAID'
        WHEN 3 THEN 'SHIPPED'
        WHEN 4 THEN 'COMPLETED'
        WHEN 5 THEN 'CANCELLED'
        ELSE 'UNKNOWN'
    END as status,
    -- Bucket amounts into tiers
    CASE
        WHEN amount <= 100 THEN 'SMALL'
        WHEN amount <= 1000 THEN 'MEDIUM'
        WHEN amount <= 10000 THEN 'LARGE'
        ELSE 'VIP'
    END as amount_category,
    -- Flag row validity
    CASE
        WHEN order_no IS NULL OR order_no = '' THEN false
        WHEN order_date IS NULL THEN false
        WHEN order_date > CURRENT_DATE() THEN false
        ELSE true
    END as is_valid,
    -- Record why a row is invalid
    CONCAT_WS(',',
        CASE WHEN order_no IS NULL OR order_no = '' THEN 'order_no_invalid' END,
        CASE WHEN order_date IS NULL THEN 'order_date_null' END,
        CASE WHEN order_date > CURRENT_DATE() THEN 'future_date' END
    ) as invalid_reason,
    CURRENT_TIMESTAMP() as etl_time
FROM mysql_sales_raw
-- The raw table has no dt partition; select the batch by its update timestamp
WHERE DATE(last_update) = '${hiveconf:batch_date}';
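To keep the cleansing traceable, it also helps to route rejected rows into a quarantine table for later review instead of silently dropping them downstream. A minimal sketch; the quarantine table name is an assumption:

-- Hypothetical quarantine table with the same layout as the cleansed table
CREATE TABLE IF NOT EXISTS sales_order_rejected
STORED AS ORC
AS SELECT * FROM sales_order_cleaned WHERE 1 = 0;

INSERT INTO TABLE sales_order_rejected
SELECT * FROM sales_order_cleaned
WHERE dt = '${hiveconf:batch_date}'
  AND is_valid = false;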
Step 4: Dimension Integration and Joins
-- 1. Build the customer dimension table
CREATE TABLE dim_customer AS
SELECT
c.id,
c.name,
c.email,
c.phone,
CASE
WHEN c.age < 0 THEN NULL
WHEN c.age > 120 THEN NULL
ELSE c.age
END as age,
CASE
WHEN c.gender NOT IN ('M','F','U') THEN 'U'
ELSE c.gender
END as gender,
c.registration_date,
c.city,
c.country
FROM mysql_customer_raw c
WHERE c.is_active = 1;
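Before joining, it is worth verifying that the join keys actually resolve, since orphaned customer IDs fall into the LEFT JOIN's NULL branch below. A quick check:

-- Count cleaned orders whose customer_id has no match in dim_customer
SELECT COUNT(*) AS orphan_orders
FROM sales_order_cleaned s
LEFT JOIN dim_customer c ON s.customer_id = c.id
WHERE s.dt = '${hiveconf:batch_date}'
  AND s.customer_id IS NOT NULL
  AND c.id IS NULL;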
-- 2. Create the fact table (joined to dimensions)
CREATE TABLE fact_sales_daily (
order_id BIGINT,
order_no STRING,
customer_key BIGINT,
customer_segment STRING,
amount DECIMAL(10,2),
order_date DATE,
status STRING,
region STRING
)
PARTITIONED BY (order_year INT, order_month INT);
-- Fully dynamic partition inserts require nonstrict mode
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
INSERT OVERWRITE TABLE fact_sales_daily
PARTITION(order_year, order_month)
SELECT
s.order_id,
s.order_no,
c.id as customer_key,
CASE
WHEN c.registration_date >= DATE_SUB(CURRENT_DATE(), 30) THEN 'NEW'
WHEN s.amount > 1000 THEN 'VIP'
ELSE 'REGULAR'
END as customer_segment,
s.amount,
s.order_date,
s.status,
    -- dim_customer carries country rather than a region column; derive region from it
    COALESCE(c.country, 'UNKNOWN') as region,
s.order_year,
s.order_month
FROM sales_order_cleaned s
LEFT JOIN dim_customer c ON s.customer_id = c.id
WHERE s.is_valid = true
AND s.dt = '${hiveconf:batch_date}';
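A cheap sanity check after the load is to reconcile row counts between the cleansed source and the fact table. Note the caveat: the fact table has no dt column, so this sketch assumes orders in a batch carry that day's order_date, which may not hold if late-arriving updates are common:

-- Valid cleaned rows for the batch vs. rows visible in the fact table
SELECT 'cleaned_valid_rows' AS metric, COUNT(*) AS cnt
FROM sales_order_cleaned
WHERE dt = '${hiveconf:batch_date}' AND is_valid = true
UNION ALL
SELECT 'fact_rows' AS metric, COUNT(*) AS cnt
FROM fact_sales_daily
WHERE order_date = '${hiveconf:batch_date}';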
Step 5: Data Quality Validation and Monitoring
-- 1. Create the data quality monitoring table
CREATE TABLE data_quality_monitor (
batch_date STRING,
table_name STRING,
metric_name STRING,
metric_value DECIMAL(20,4),
threshold_value DECIMAL(20,4),
is_passed BOOLEAN,
check_time TIMESTAMP
);
-- 2. Run the quality checks
INSERT INTO data_quality_monitor
SELECT
'${hiveconf:batch_date}' as batch_date,
'fact_sales_daily' as table_name,
metric_name,
metric_value,
threshold_value,
metric_value >= threshold_value as is_passed,
CURRENT_TIMESTAMP() as check_time
FROM (
-- Completeness check
SELECT
'completeness_rate' as metric_name,
COUNT(CASE WHEN order_no IS NOT NULL THEN 1 END) * 1.0 / COUNT(*) as metric_value,
0.99 as threshold_value
FROM fact_sales_daily
-- fact_sales_daily has no dt column; filter the batch by order_date instead
WHERE order_date = '${hiveconf:batch_date}'
UNION ALL
-- Uniqueness check
SELECT
'unique_order_rate' as metric_name,
COUNT(DISTINCT order_no) * 1.0 / COUNT(*) as metric_value,
1.0 as threshold_value
FROM fact_sales_daily
WHERE order_date = '${hiveconf:batch_date}'
UNION ALL
-- Accuracy check (amounts must be positive)
SELECT
'positive_amount_rate' as metric_name,
SUM(CASE WHEN amount > 0 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) as metric_value,
1.0 as threshold_value
FROM fact_sales_daily
WHERE order_date = '${hiveconf:batch_date}'
) t;
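With the metrics stored, the batch can be gated on failed checks; a scheduler can run something like the following and abort the pipeline when it returns any rows:

-- Surface failed checks for the current batch
SELECT table_name, metric_name, metric_value, threshold_value
FROM data_quality_monitor
WHERE batch_date = '${hiveconf:batch_date}'
  AND is_passed = false;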
Performance Tuning Tips
1. Partitioning and Bucketing Strategy
-- Partition by date + bucket by customer ID
CREATE TABLE fact_sales_optimized (
    -- column definitions go here
)
PARTITIONED BY (order_date DATE)
CLUSTERED BY (customer_id) INTO 32 BUCKETS
STORED AS ORC;
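Bucketing on customer_id pays off at join time if the dimension table is bucketed the same way (an assumption; dim_customer above is not bucketed as written), because Hive can then use bucketed map joins. A sketch of the relevant session settings:

-- Enable bucket map joins (both tables must be bucketed on the join key)
SET hive.optimize.bucketmapjoin = true;
-- Also allow sort-merge bucket joins when bucket data is sorted
SET hive.optimize.bucketmapjoin.sortedmerge = true;
SET hive.auto.convert.sortmerge.join = true;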
2. Compression and Bloom Filter Tuning
-- Hive 3.0 removed CREATE INDEX, so the classic bitmap-index trick from older
-- tutorials no longer applies; with ORC, use bloom filters on selective columns
ALTER TABLE fact_sales_daily
SET TBLPROPERTIES ("orc.bloom.filter.columns" = "order_no,customer_id");
-- Note: bloom filters only apply to files written after the property is set
3. Vectorized Execution
-- Enable vectorized query execution (effective for ORC-backed tables)
SET hive.vectorized.execution.enabled = true;
SET hive.vectorized.execution.reduce.enabled = true;
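Vectorization combines well with a DAG execution engine and up-to-date statistics; a typical session setup might look like the following (Tez availability depends on your cluster):

-- Run on Tez instead of MapReduce, and let the CBO use column statistics
SET hive.execution.engine = tez;
SET hive.cbo.enable = true;
SET hive.stats.autogather = true;
-- Refresh statistics after large loads
ANALYZE TABLE fact_sales_daily PARTITION(order_year, order_month)
COMPUTE STATISTICS FOR COLUMNS;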
Automated Scheduling Script (Shell Example)
#!/bin/bash
# auto_cleanse_mysql_data.sh
# Propagate the ERR trap into functions, and make pipelines report
# hive's exit code rather than tee's
set -o errtrace -o pipefail
# Environment variables
BATCH_DATE=$(date -d "-1 day" +%Y-%m-%d)
LOG_FILE="/var/log/etl/sales_cleanse_${BATCH_DATE}.log"
# Helper: write a timestamped log line
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}
# Main flow
main() {
    log "Starting data cleansing for ${BATCH_DATE}"
    # Step 1: Sqoop incremental import
    log "Running Sqoop incremental import"
    sqoop job --exec mysql_to_hive_incr 2>&1 | tee -a "$LOG_FILE"
    # Step 2: Hive cleansing
    log "Running Hive cleansing transformation"
    # The HQL files reference ${hiveconf:batch_date}, so pass --hiveconf
    # (--hivevar would define ${hivevar:batch_date} and never be substituted)
    hive --hiveconf batch_date="${BATCH_DATE}" \
        -f /opt/etl/sql/cleanse_sales.hql 2>&1 | tee -a "$LOG_FILE"
    # Step 3: Data quality checks
    log "Running data quality validation"
    hive --hiveconf batch_date="${BATCH_DATE}" \
        -f /opt/etl/sql/quality_check.hql 2>&1 | tee -a "$LOG_FILE"
    # Step 4: Notification
    if [ $? -eq 0 ]; then
        log "Data cleansing finished"
        echo "ETL job completed successfully" | mail -s "ETL Success ${BATCH_DATE}" admin@example.com
    else
        log "Data cleansing failed"
        echo "ETL job failed, please check the log" | mail -s "ETL Failed ${BATCH_DATE}" admin@example.com
        exit 1
    fi
}
# Abort on unexpected errors
trap "log 'Script terminated abnormally'; exit 1" ERR
main
Common Problems and Solutions
Problem 1: Large data volumes make cleansing slow
Solutions (see the settings sketch below):
- Compress MapReduce intermediate results
- Switch to the Tez or Spark execution engine
- Split cleansing into staged passes
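A sketch of the session settings behind the first two bullets; codec availability depends on what your cluster ships:

-- Compress intermediate map output (Snappy is a common choice)
SET hive.exec.compress.intermediate = true;
SET mapreduce.map.output.compress = true;
SET mapreduce.map.output.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- Run the job on Tez rather than MapReduce
SET hive.execution.engine = tez;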
Problem 2: Type mismatches between MySQL and Hive
Solution:
-- Cast types explicitly
CAST(mysql_decimal AS DECIMAL(10,2)) as amount,
-- Note: DATE_FORMAT returns a STRING; cast again if a DATE column is needed
DATE_FORMAT(mysql_datetime, 'yyyy-MM-dd') as order_date
Problem 3: Merging incremental updates with history
Solution:
-- MERGE INTO (Hive 2.2+; the target must be an ACID transactional table)
MERGE INTO fact_sales_daily AS target
USING sales_order_cleaned AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN
UPDATE SET target.amount = source.amount
WHEN NOT MATCHED THEN
INSERT VALUES (source.order_id, ...);
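MERGE only works against a transactional target, which the fact_sales_daily definition above is not. A sketch of the prerequisites; the table name and columns here are illustrative, and the transaction settings may already be enabled cluster-side:

-- Transactions require the DbTxnManager and concurrency support
SET hive.support.concurrency = true;
SET hive.txn.manager = org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
-- A minimal transactional target (Hive 3.x; Hive 2.x additionally requires bucketing)
CREATE TABLE fact_sales_txn (
    order_id BIGINT,
    amount DECIMAL(10,2)
)
STORED AS ORC
TBLPROPERTIES ("transactional" = "true");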
Summary and Best Practices
The steps above cover the full cleansing pipeline from MySQL to Hive. Key takeaways:
- Design a standardized cleansing flow: extract → diagnose → cleanse → transform → validate
- Keep data traceable: retain the raw data and log every cleansing action
- Balance performance and quality: tune queries and monitor quality continuously
- Automate and schedule: reduce manual steps and improve efficiency
- Document and maintain: record cleansing rules so the team can collaborate
Hopefully this hands-on guide helps you build an efficient data cleansing pipeline. In practice, adjust the cleansing rules and tuning strategies to your specific business scenario.
Technology never stands still, and data quality is always a work in progress!
