Hive Data Cleansing in Practice: A Complete Pipeline from MySQL to the Data Lake

Environment Setup and Tool Selection

Tech Stack Configuration

# Core components
- Hadoop 3.x
- Hive 3.x
- Sqoop 1.4.7
- MySQL 8.0

# Optional tools
- Airflow (scheduling)
- Hue (visualization)

The Complete Data Cleansing Pipeline

Step 1: Data Extraction (Sqoop Import from MySQL)

-- 1. Create a Hive external table mapped onto MySQL
--    (Hive 3.x ships the JDBC storage handler for this)
CREATE EXTERNAL TABLE mysql_sales_raw (
    id BIGINT,
    order_no STRING,
    customer_id BIGINT,
    amount DECIMAL(10,2),
    order_date TIMESTAMP,
    status INT,
    last_update TIMESTAMP
)
COMMENT 'Raw sales order table (MySQL mapping)'
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
    "hive.sql.database.type" = "MYSQL",
    "hive.sql.jdbc.driver" = "com.mysql.cj.jdbc.Driver",
    "hive.sql.jdbc.url" = "jdbc:mysql://192.168.1.100:3306/biz_db",
    "hive.sql.dbcp.username" = "etl_user",
    "hive.sql.dbcp.password" = "******",
    "hive.sql.table" = "sales_order"
);

# Alternatively, use Sqoop incremental import (recommended). Note that Sqoop
# rejects --incremental lastmodified together with --hive-import, so the
# increment lands in an HDFS staging directory (path illustrative) that a
# Hive external table or LOAD DATA step then picks up.
sqoop import \
  --connect jdbc:mysql://192.168.1.100:3306/biz_db \
  --username etl_user \
  --password '******' \
  --table sales_order \
  --target-dir /warehouse/staging/sales_order_raw \
  --incremental lastmodified \
  --check-column last_update \
  --last-value "2024-01-01 00:00:00" \
  --merge-key id
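
The scheduling script at the end of this post runs a saved Sqoop job named mysql_to_hive_incr. A minimal sketch of creating it once up front (the --password-file path is an assumption; any HDFS file readable by the ETL user works):

# A saved job records the new --last-value in the Sqoop metastore after
# each run, so the increment window advances automatically
sqoop job --create mysql_to_hive_incr \
  -- import \
  --connect jdbc:mysql://192.168.1.100:3306/biz_db \
  --username etl_user \
  --password-file /user/etl/.mysql_pass \
  --table sales_order \
  --target-dir /warehouse/staging/sales_order_raw \
  --incremental lastmodified \
  --check-column last_update \
  --last-value "2024-01-01 00:00:00" \
  --merge-key id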

Step 2: Data Quality Checks and Problem Diagnosis

-- 1. Data profiling overview
CREATE TABLE data_quality_report AS
SELECT 
    'sales_order' as table_name,
    COUNT(*) as total_rows,
    COUNT(DISTINCT id) as distinct_ids,
    SUM(CASE WHEN id IS NULL THEN 1 ELSE 0 END) as null_ids,
    SUM(CASE WHEN order_no IS NULL OR order_no = '' THEN 1 ELSE 0 END) as null_order_nos,
    MIN(order_date) as min_date,
    MAX(order_date) as max_date,
    AVG(amount) as avg_amount,
    SUM(CASE WHEN amount < 0 THEN 1 ELSE 0 END) as negative_amounts
FROM mysql_sales_raw;

-- 2. Duplicate detection
SELECT 
    order_no, 
    COUNT(*) as duplicate_count,
    COLLECT_LIST(id) as duplicate_ids
FROM mysql_sales_raw
GROUP BY order_no
HAVING COUNT(*) > 1;
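
Once duplicates are confirmed, a common fix is to keep only the most recently updated row per order_no. A sketch (the tie-breaker is last_update here; adjust to your business rule):

-- Keep the latest row per order_no, ranked by last_update
SELECT id, order_no, customer_id, amount, order_date, status, last_update
FROM (
    SELECT t.*,
           ROW_NUMBER() OVER (PARTITION BY order_no ORDER BY last_update DESC) AS rn
    FROM mysql_sales_raw t
) ranked
WHERE rn = 1;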

Step 3: Core Cleansing and Transformation Logic

-- 1. Create the cleansed staging table
CREATE TABLE sales_order_cleaned (
    order_id BIGINT,
    order_no STRING,
    customer_id BIGINT,
    amount DECIMAL(10,2),
    order_date DATE,
    order_year INT,
    order_month INT,
    order_week INT,
    status STRING,
    amount_category STRING,
    is_valid BOOLEAN,
    invalid_reason STRING,
    etl_time TIMESTAMP
)
PARTITIONED BY (dt STRING)
STORED AS ORC
TBLPROPERTIES ("orc.compress"="SNAPPY");

-- 2. Run the cleansing transformation
--    (batch_date is passed in via hive --hivevar, see the scheduling script below)
INSERT OVERWRITE TABLE sales_order_cleaned PARTITION(dt='${hivevar:batch_date}')
SELECT 
    id as order_id,
    -- Trim whitespace and normalize the order number
    UPPER(TRIM(order_no)) as order_no,
    -- Null out invalid customer IDs
    CASE 
        WHEN customer_id <= 0 THEN NULL
        ELSE customer_id 
    END as customer_id,
    -- Clamp the amount to a sane range
    CASE 
        WHEN amount > 1000000 THEN 1000000
        WHEN amount < 0 THEN 0
        ELSE ROUND(amount, 2)
    END as amount,
    -- Standardize the date format
    TO_DATE(order_date) as order_date,
    YEAR(order_date) as order_year,
    MONTH(order_date) as order_month,
    WEEKOFYEAR(order_date) as order_week,
    -- Map status codes to labels
    CASE status
        WHEN 1 THEN 'PENDING'
        WHEN 2 THEN 'PAID'
        WHEN 3 THEN 'SHIPPED'
        WHEN 4 THEN 'COMPLETED'
        WHEN 5 THEN 'CANCELLED'
        ELSE 'UNKNOWN'
    END as status,
    -- Amount tiers
    CASE 
        WHEN amount <= 100 THEN 'SMALL'
        WHEN amount <= 1000 THEN 'MEDIUM'
        WHEN amount <= 10000 THEN 'LARGE'
        ELSE 'VIP'
    END as amount_category,
    -- Validity flag
    CASE 
        WHEN order_no IS NULL OR order_no = '' THEN false
        WHEN order_date IS NULL THEN false
        WHEN order_date > CURRENT_DATE THEN false
        ELSE true
    END as is_valid,
    -- Record why a row is invalid
    CONCAT_WS(',',
        CASE WHEN order_no IS NULL OR order_no = '' THEN 'order_no_invalid' END,
        CASE WHEN order_date IS NULL THEN 'order_date_null' END,
        CASE WHEN order_date > CURRENT_DATE THEN 'future_date' END
    ) as invalid_reason,
    CURRENT_TIMESTAMP as etl_time
FROM mysql_sales_raw
-- The raw table has no dt column; select the batch by last_update instead
WHERE TO_DATE(last_update) = '${hivevar:batch_date}';
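
Before downstream jobs silently drop the invalid rows, it is worth reviewing why they failed. A quick per-batch breakdown:

-- Count rejected rows per reason for the current batch
SELECT invalid_reason, COUNT(*) AS rejected_rows
FROM sales_order_cleaned
WHERE dt = '${hivevar:batch_date}'
  AND is_valid = false
GROUP BY invalid_reason;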

Step 4: Dimension Integration and Joins

-- 1. Create the customer dimension table
--    (mysql_customer_raw is imported the same way as the sales table)
CREATE TABLE dim_customer AS
SELECT 
    c.id,
    c.name,
    c.email,
    c.phone,
    CASE 
        WHEN c.age < 0 THEN NULL
        WHEN c.age > 120 THEN NULL
        ELSE c.age
    END as age,
    CASE 
        WHEN c.gender NOT IN ('M','F','U') THEN 'U'
        ELSE c.gender
    END as gender,
    c.registration_date,
    c.city,
    c.country
FROM mysql_customer_raw c
WHERE c.is_active = 1;

-- 2. Create the fact table (joined to dimensions)
CREATE TABLE fact_sales_daily (
    order_id BIGINT,
    order_no STRING,
    customer_key BIGINT,
    customer_segment STRING,
    amount DECIMAL(10,2),
    order_date DATE,
    status STRING,
    region STRING
)
PARTITIONED BY (order_year INT, order_month INT);
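
The insert below writes to the (order_year, order_month) partitions dynamically, which Hive rejects in strict mode, so enable dynamic partitioning first:

SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;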

INSERT OVERWRITE TABLE fact_sales_daily 
PARTITION(order_year, order_month)
SELECT 
    s.order_id,
    s.order_no,
    c.id as customer_key,
    CASE 
        WHEN c.registration_date >= DATE_SUB(CURRENT_DATE, 30) THEN 'NEW'
        WHEN s.amount > 1000 THEN 'VIP'
        ELSE 'REGULAR'
    END as customer_segment,
    s.amount,
    s.order_date,
    s.status,
    COALESCE(c.country, 'UNKNOWN') as region,  -- dim_customer exposes country, not region
    s.order_year,
    s.order_month
FROM sales_order_cleaned s
LEFT JOIN dim_customer c ON s.customer_id = c.id
WHERE s.is_valid = true
  AND s.dt = '${hivevar:batch_date}';

Step 5: Data Quality Validation and Monitoring

-- 1. Create the data quality monitoring table
CREATE TABLE data_quality_monitor (
    batch_date STRING,
    table_name STRING,
    metric_name STRING,
    metric_value DECIMAL(20,4),
    threshold_value DECIMAL(20,4),
    is_passed BOOLEAN,
    check_time TIMESTAMP
);

-- 2. Run the quality checks
INSERT INTO data_quality_monitor
SELECT 
    '${hivevar:batch_date}' as batch_date,
    'fact_sales_daily' as table_name,
    metric_name,
    metric_value,
    threshold_value,
    metric_value >= threshold_value as is_passed,
    CURRENT_TIMESTAMP as check_time
FROM (
    -- Completeness check
    SELECT 
        'completeness_rate' as metric_name,
        COUNT(CASE WHEN order_no IS NOT NULL THEN 1 END) * 1.0 / COUNT(*) as metric_value,
        0.99 as threshold_value
    FROM fact_sales_daily 
    WHERE order_date = '${hivevar:batch_date}'  -- fact_sales_daily has no dt; filter on order_date
    
    UNION ALL
    
    -- Uniqueness check
    SELECT 
        'unique_order_rate' as metric_name,
        COUNT(DISTINCT order_no) * 1.0 / COUNT(*) as metric_value,
        1.0 as threshold_value
    FROM fact_sales_daily 
    WHERE order_date = '${hivevar:batch_date}'
    
    UNION ALL
    
    -- Accuracy check (amounts must be positive)
    SELECT 
        'positive_amount_rate' as metric_name,
        SUM(CASE WHEN amount > 0 THEN 1 ELSE 0 END) * 1.0 / COUNT(*) as metric_value,
        1.0 as threshold_value
    FROM fact_sales_daily 
    WHERE order_date = '${hivevar:batch_date}'
) t;
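
The scheduler can then gate the batch on this table, for example by aborting publication when any check failed:

-- Returns 0 when every check passed for the batch
SELECT COUNT(*) AS failed_checks
FROM data_quality_monitor
WHERE batch_date = '${hivevar:batch_date}'
  AND is_passed = false;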

Performance Tuning Tips

1. Partitioning and Bucketing Strategy

-- Partition by date + bucket by customer ID
CREATE TABLE fact_sales_optimized (
    -- column definitions omitted
)
PARTITIONED BY (order_date DATE)
CLUSTERED BY (customer_id) INTO 32 BUCKETS
STORED AS ORC;
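
Besides enabling bucket map joins on customer_id, the fixed bucket layout gives cheap sampling for profiling queries:

-- Read one bucket (roughly 1/32 of the rows) instead of scanning the table
SELECT * FROM fact_sales_optimized
TABLESAMPLE(BUCKET 1 OUT OF 32 ON customer_id);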

2. Bloom Filters and Compression

-- CREATE INDEX was removed in Hive 3.0, so bitmap indexes are no longer an
-- option; on ORC tables, bloom filters serve the same purpose for selective
-- columns (they apply only to files written after the property is set)
ALTER TABLE fact_sales_daily
SET TBLPROPERTIES ('orc.bloom.filter.columns' = 'status,order_no');

3. Vectorized Execution

-- Enable vectorized query execution (effective on ORC-backed tables like ours)
SET hive.vectorized.execution.enabled = true;
SET hive.vectorized.execution.reduce.enabled = true;

Automated Scheduling Script (Shell Example)

#!/bin/bash
# auto_cleanse_mysql_data.sh

# Fail a pipeline when hive/sqoop fails, not just the trailing tee
set -o pipefail

# Environment variables
BATCH_DATE=$(date -d "-1 day" +%Y-%m-%d)
LOG_FILE="/var/log/etl/sales_cleanse_${BATCH_DATE}.log"

# Helper: write a timestamped log line
log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Helper: alert and abort on failure
fail() {
    log "Data cleansing failed"
    echo "ETL job failed, please check the log" | mail -s "ETL Failed ${BATCH_DATE}" admin@example.com
    exit 1
}

# Main flow
main() {
    log "Starting data cleansing for ${BATCH_DATE}"

    # Step 1: Sqoop incremental import
    log "Running Sqoop incremental import"
    sqoop job --exec mysql_to_hive_incr 2>&1 | tee -a "$LOG_FILE" || fail

    # Step 2: Hive cleansing
    log "Running Hive cleansing transformations"
    hive --hivevar batch_date="${BATCH_DATE}" \
         -f /opt/etl/sql/cleanse_sales.hql 2>&1 | tee -a "$LOG_FILE" || fail

    # Step 3: Data quality checks
    log "Running data quality validation"
    hive --hivevar batch_date="${BATCH_DATE}" \
         -f /opt/etl/sql/quality_check.hql 2>&1 | tee -a "$LOG_FILE" || fail

    # Step 4: Success notification
    log "Data cleansing completed"
    echo "ETL job completed successfully" | mail -s "ETL Success ${BATCH_DATE}" admin@example.com
}

# Catch unexpected aborts
trap "log 'Script terminated unexpectedly'; exit 1" ERR

main
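
A crontab entry to run the script nightly (the schedule and install path are illustrative):

# Run the cleansing pipeline daily at 02:30
30 2 * * * /opt/etl/bin/auto_cleanse_mysql_data.sh >> /var/log/etl/cron.log 2>&1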

Common Problems and Solutions

Problem 1: Large data volumes make cleansing slow

Solution

  • Compress MapReduce intermediate results (see the settings sketch below)
  • Switch the execution engine to Tez or Spark
  • Cleanse in stages instead of one monolithic query
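
A minimal sketch of the session settings behind the first two points (the codec is an assumption; use whatever your cluster ships):

-- Compress intermediate results between stages
SET hive.exec.compress.intermediate = true;
SET mapreduce.map.output.compress = true;
SET mapreduce.map.output.compress.codec = org.apache.hadoop.io.compress.SnappyCodec;
-- Run on Tez instead of classic MapReduce
SET hive.execution.engine = tez;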

Problem 2: MySQL and Hive data types don't match

Solution

-- Explicit type conversion
CAST(mysql_decimal AS DECIMAL(10,2)) as amount,
DATE_FORMAT(mysql_datetime, 'yyyy-MM-dd') as order_date

Problem 3: Merging incremental updates with historical data

Solution

-- MERGE INTO (Hive 2.2+; requires a transactional target table)
MERGE INTO fact_sales_daily AS target
USING sales_order_cleaned AS source
ON target.order_id = source.order_id
WHEN MATCHED THEN 
    UPDATE SET target.amount = source.amount
WHEN NOT MATCHED THEN 
    INSERT VALUES (source.order_id, ...);
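
MERGE only works against a full-ACID target, so fact_sales_daily would need to be created (or migrated) as a transactional ORC table. A sketch:

-- ACID targets must be ORC with transactions enabled
CREATE TABLE fact_sales_daily_acid (
    order_id BIGINT,
    amount DECIMAL(10,2)
    -- remaining columns as in fact_sales_daily
)
STORED AS ORC
TBLPROPERTIES ('transactional' = 'true');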

Summary and Best Practices

With the steps above, we have walked through a complete MySQL-to-Hive data cleansing pipeline. Key takeaways:

  1. Design a standardized cleansing flow: extract → diagnose → cleanse → transform → validate
  2. Keep data traceable: retain the raw data and log every cleansing run
  3. Weigh performance and quality equally: tune the queries and monitor quality continuously
  4. Automate and schedule: reduce manual intervention and improve efficiency
  5. Document and maintain: record the cleansing rules so the team can collaborate

I hope this hands-on guide helps you build an efficient data cleansing pipeline. In real projects, adjust the cleansing rules and tuning strategies to your specific business scenario.

Technology never stands still, and data quality is always a journey!
