实验三

任务一:导入数据

  • 创建两个数据表
    • Hive QL代码
      USE default;  -- 选择使用名为 default 的数据库
      
      DROP TABLE IF EXISTS ccf_offline_stage1_train;
      CREATE TABLE ccf_offline_stage1_train (
        user_id STRING,
        merchant_id STRING,
        coupon_id STRING,
        discount_rate STRING,
        distance STRING,
        date_received STRING,
        `date` STRING
      ) -- 创建新表,包含7个字段
      -- 这里 data 要打 `` 的原因是data是关键字
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','  -- 指定数据文件使用逗号(,)作为字段分隔符(CSV格式)
      STORED AS TEXTFILE  -- 数据以文本文件格式存储
      TBLPROPERTIES (
        "skip.header.line.count"="1"
      ); -- 表属性设置:跳过外部数据文件的第一行(通常是标题行)
      
      DROP TABLE IF EXISTS ccf_online_stage1_train;
      CREATE TABLE ccf_online_stage1_train (
        user_id STRING,
        merchant_id STRING,
        action STRING,
        coupon_id STRING,
        discount_rate STRING,
        date_received STRING,
        `date` STRING
      )
      ROW FORMAT DELIMITED
      FIELDS TERMINATED BY ','
      STORED AS TEXTFILE
      TBLPROPERTIES (
        "skip.header.line.count"="1"
      );
      
      LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_offline_stage1_train.csv'
      -- 从本地(LOCAL)文件系统(不是HDFS)加载文件
      OVERWRITE INTO TABLE ccf_offline_stage1_train;
      -- 覆盖写入新数据到之前的表中
      
      LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_online_stage1_train.csv'
      OVERWRITE INTO TABLE ccf_online_stage1_train;
      
    • 执行结果
      image
  • 查询验证数据加载成功
    • Hive QL代码
      USE default;
      
      SELECT * FROM ccf_offline_stage1_train LIMIT 5;
      -- 从 ccf_offline_stage1_train 表中查询所有字段(* 表示所有列),只返回前五列
      SELECT * FROM ccf_online_stage1_train LIMIT 5;
      SELECT COUNT(*) FROM ccf_offline_stage1_train;
      -- 统计 ccf_offline_stage1_train 表中的总行数
      SELECT COUNT(*) FROM ccf_online_stage1_train;
      
      
    • 执行结果
      • 查询
        image
      • 计数
        image
        image

任务二:基本数据查询

  • 查询⽤户⾏为数量
    • Hive QL代码
      USE default;
      
      SELECT
        CASE action
      	WHEN '0' THEN 'CLICK'
      	WHEN '1' THEN 'BUY'
      	WHEN '2' THEN 'RECEIVE'
      	ELSE CONCAT('UNKNOWN_', action)
        END AS behavior, -- 将原始的动作编码转换为有意义的英文名称
        COUNT(*) AS total_count -- 计算每种行为出现的总次数
      FROM ccf_online_stage1_train -- 从名为 ccf_online_stage1_train 的数据表中读取数据
      GROUP BY action -- 按原始的动作编码进行分组统计
      ORDER BY total_count DESC; -- 按行为数量从高到低排序
      
    • 举例
      • 假设原始数据是:0,0,1,0,2,1
      • 分组之后
        组1 (action='0'): 记录1, 记录2, 记录4
        组2 (action='1'): 记录3, 记录6
        组3 (action='2'): 记录5
        
      • 对每个组分别统计行数
        组1: COUNT(*) = 3
        组2: COUNT(*) = 2  
        组3: COUNT(*) = 1
        
    • 执行结果
      image
  • 查询指定商家优惠券使⽤情况
    • Hive QL代码
      USE default;
      
      DROP TABLE IF EXISTS onlineconsumption_table;
      CREATE TABLE onlineconsumption_table AS -- 这里的AS是创建表的语法关键字,表示"基于查询结果创建表"
      SELECT
        merchant_id,
        SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NULL THEN 1 ELSE 0 END) AS negative_samples,
        SUM(CASE WHEN coupon_id IS NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS normal_consumption,
        SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS positive_samples
      FROM ccf_online_stage1_train
      GROUP BY merchant_id;
      
      SELECT * FROM onlineconsumption_table LIMIT 20;
      
    • 执行结果
      image

任务三:数据聚合分析

  • 商家周边活跃顾客数量统计
    • Hive QL代码
      USE default;
      
      DROP TABLE IF EXISTS merchant_distance_active_customers;
      CREATE TABLE merchant_distance_active_customers AS
      SELECT
        merchant_id,
        NVL(distance, 'UNKNOWN') AS distance_bucket,
        COUNT(DISTINCT user_id) AS active_customer_count
      FROM ccf_offline_stage1_train
      GROUP BY merchant_id, NVL(distance, 'UNKNOWN');
      
      SELECT
        merchant_id,
        distance_bucket,
        active_customer_count
      FROM merchant_distance_active_customers
      ORDER BY merchant_id, distance_bucket
      LIMIT 50;
      
    • 执行结果
      image
      image
      其中第一列是商家ID,第二列是距离x,第三列是数目
  • 商家正样本比例统计
    • Hive QL代码
      USE default;
      
      DROP TABLE IF EXISTS merchant_positive_ratio;
      CREATE TABLE merchant_positive_ratio AS
      SELECT
        merchant_id,
        positive_samples,
        negative_samples,
        normal_consumption,
        (negative_samples + normal_consumption + positive_samples) AS total_samples,
        CASE
      	WHEN (negative_samples + normal_consumption + positive_samples) > 0 THEN
      	  positive_samples / CAST((negative_samples + normal_consumption + positive_samples) AS DOUBLE)
      	ELSE 0.0
        END AS positive_ratio
      FROM onlineconsumption_table;
      
      SELECT
        merchant_id,
        positive_ratio,
        positive_samples,
        total_samples
      FROM merchant_positive_ratio
      ORDER BY positive_ratio DESC, positive_samples DESC
      LIMIT 10;
      
    • 执行结果
      image

任务四:复杂查询与分析

  • 优惠券使用时间统计
    • Hive QL代码
      USE default;
      
      DROP TABLE IF EXISTS coupon_usage_interval_stats;
      CREATE TABLE coupon_usage_interval_stats AS
      WITH offline_clean AS (
        SELECT
      	CASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,
      	CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,
      	CASE WHEN date_received IS NOT NULL AND date_received != 'null' THEN date_received ELSE NULL END AS date_received
        FROM ccf_offline_stage1_train
      ),
      coupon_usage AS (
        SELECT
      	coupon_id,
      	DATEDIFF(
      	  TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_consumed, 'yyyyMMdd'))),
      	  TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_received, 'yyyyMMdd')))
      	) AS usage_interval
        FROM offline_clean
        WHERE coupon_id IS NOT NULL
      	AND date_consumed IS NOT NULL
      	AND date_received IS NOT NULL
      ),
      usage_counts AS (
        SELECT
      	coupon_id,
      	COUNT(*) AS usage_count,
      	AVG(usage_interval) AS avg_usage_interval
        FROM coupon_usage
        GROUP BY coupon_id
      ),
      total_usage AS (
        SELECT SUM(usage_count) AS total_usage_count FROM usage_counts
      )
      SELECT
        u.coupon_id,
        u.usage_count,
        ROUND(u.avg_usage_interval, 2) AS avg_usage_interval
      FROM usage_counts u
      CROSS JOIN total_usage t
      WHERE u.usage_count > t.total_usage_count * 0.01;
      
      SELECT
        coupon_id,
        usage_count,
        avg_usage_interval
      FROM coupon_usage_interval_stats
      ORDER BY usage_count DESC, coupon_id
      LIMIT 100;
      
    • 执行结果
      image
  • 优惠券折扣率统计
    • Hive QL代码
      USE default;
      
      DROP TABLE IF EXISTS coupon_usage_rate_stats;
      CREATE TABLE coupon_usage_rate_stats AS
      WITH offline_clean AS (
        SELECT
      	CASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id,
      	CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed,
      	CASE WHEN discount_rate IS NOT NULL AND discount_rate != 'null' THEN discount_rate ELSE NULL END AS discount_rate
        FROM ccf_offline_stage1_train
      ),
      coupon_totals AS (
        SELECT
      	coupon_id,
      	MAX(discount_rate) AS discount_rate,
      	COUNT(*) AS total_count,
      	SUM(CASE WHEN date_consumed IS NOT NULL THEN 1 ELSE 0 END) AS used_count
        FROM offline_clean
        WHERE coupon_id IS NOT NULL
        GROUP BY coupon_id
      ),
      final_stats AS (
        SELECT
      	coupon_id,
      	used_count,
      	total_count,
      	CASE WHEN total_count > 0 THEN used_count / CAST(total_count AS DOUBLE) ELSE 0.0 END AS usage_rate,
      	CASE
      	  WHEN discount_rate LIKE '%:%' THEN 'threshold'
      	  ELSE 'direct'
      	END AS discount_type,
      	CASE
      	  WHEN discount_rate LIKE '%:%' THEN
      		CASE
      		  WHEN CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE) > 0 THEN
      			(CAST(SPLIT(discount_rate, ':')[1] AS DOUBLE) / CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE)) * 100
      		  ELSE NULL
      		END
      	  ELSE
      		(1 - CAST(discount_rate AS DOUBLE)) * 100
      	END AS discount_percentage
        FROM coupon_totals
      )
      SELECT
        coupon_id,
        ROUND(usage_rate, 4) AS usage_rate,
        used_count,
        total_count,
        ROUND(discount_percentage, 2) AS discount_percentage_value,
        discount_type
      FROM final_stats;
      
      SELECT
        coupon_id,
        usage_rate,
        used_count,
        CONCAT(CAST(discount_percentage_value AS STRING), '%') AS discount_percentage,
        discount_type
      FROM coupon_usage_rate_stats
      ORDER BY usage_rate DESC, used_count DESC, coupon_id
      LIMIT 10;
      
    • 执行结果
      image
posted @ 2025-12-06 16:48  最爱丁珰  阅读(1)  评论(0)    收藏  举报