实验三
任务一:导入数据
- 创建两个数据表
- Hive QL代码
USE default; -- 选择使用名为 default 的数据库 DROP TABLE IF EXISTS ccf_offline_stage1_train; CREATE TABLE ccf_offline_stage1_train ( user_id STRING, merchant_id STRING, coupon_id STRING, discount_rate STRING, distance STRING, date_received STRING, `date` STRING ) -- 创建新表,包含7个字段 -- 这里 data 要打 `` 的原因是data是关键字 ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' -- 指定数据文件使用逗号(,)作为字段分隔符(CSV格式) STORED AS TEXTFILE -- 数据以文本文件格式存储 TBLPROPERTIES ( "skip.header.line.count"="1" ); -- 表属性设置:跳过外部数据文件的第一行(通常是标题行) DROP TABLE IF EXISTS ccf_online_stage1_train; CREATE TABLE ccf_online_stage1_train ( user_id STRING, merchant_id STRING, action STRING, coupon_id STRING, discount_rate STRING, date_received STRING, `date` STRING ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE TBLPROPERTIES ( "skip.header.line.count"="1" ); LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_offline_stage1_train.csv' -- 从本地(LOCAL)文件系统(不是HDFS)加载文件 OVERWRITE INTO TABLE ccf_offline_stage1_train; -- 覆盖写入新数据到之前的表中 LOAD DATA LOCAL INPATH '/workspace/hive_coupon/ccf_online_stage1_train.csv' OVERWRITE INTO TABLE ccf_online_stage1_train; - 执行结果
![image]()
- Hive QL代码
- 查询验证数据加载成功
- Hive QL代码
USE default; SELECT * FROM ccf_offline_stage1_train LIMIT 5; -- 从 ccf_offline_stage1_train 表中查询所有字段(* 表示所有列),只返回前五列 SELECT * FROM ccf_online_stage1_train LIMIT 5; SELECT COUNT(*) FROM ccf_offline_stage1_train; -- 统计 ccf_offline_stage1_train 表中的总行数 SELECT COUNT(*) FROM ccf_online_stage1_train; - 执行结果
- 查询
![image]()
- 计数
![image]()
![image]()
- 查询
- Hive QL代码
任务二:基本数据查询
- 查询⽤户⾏为数量
- Hive QL代码
USE default; SELECT CASE action WHEN '0' THEN 'CLICK' WHEN '1' THEN 'BUY' WHEN '2' THEN 'RECEIVE' ELSE CONCAT('UNKNOWN_', action) END AS behavior, -- 将原始的动作编码转换为有意义的英文名称 COUNT(*) AS total_count -- 计算每种行为出现的总次数 FROM ccf_online_stage1_train -- 从名为 ccf_online_stage1_train 的数据表中读取数据 GROUP BY action -- 按原始的动作编码进行分组统计 ORDER BY total_count DESC; -- 按行为数量从高到低排序 - 举例
- 假设原始数据是:
0,0,1,0,2,1 - 分组之后
组1 (action='0'): 记录1, 记录2, 记录4 组2 (action='1'): 记录3, 记录6 组3 (action='2'): 记录5 - 对每个组分别统计行数
组1: COUNT(*) = 3 组2: COUNT(*) = 2 组3: COUNT(*) = 1
- 假设原始数据是:
- 执行结果
![image]()
- Hive QL代码
- 查询指定商家优惠券使⽤情况
- Hive QL代码
USE default; DROP TABLE IF EXISTS onlineconsumption_table; CREATE TABLE onlineconsumption_table AS -- 这里的AS是创建表的语法关键字,表示"基于查询结果创建表" SELECT merchant_id, SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NULL THEN 1 ELSE 0 END) AS negative_samples, SUM(CASE WHEN coupon_id IS NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS normal_consumption, SUM(CASE WHEN coupon_id IS NOT NULL AND `date` IS NOT NULL THEN 1 ELSE 0 END) AS positive_samples FROM ccf_online_stage1_train GROUP BY merchant_id; SELECT * FROM onlineconsumption_table LIMIT 20; - 执行结果
![image]()
- Hive QL代码
任务三:数据聚合分析
- 商家周边活跃顾客数量统计
- Hive QL代码
USE default; DROP TABLE IF EXISTS merchant_distance_active_customers; CREATE TABLE merchant_distance_active_customers AS SELECT merchant_id, NVL(distance, 'UNKNOWN') AS distance_bucket, COUNT(DISTINCT user_id) AS active_customer_count FROM ccf_offline_stage1_train GROUP BY merchant_id, NVL(distance, 'UNKNOWN'); SELECT merchant_id, distance_bucket, active_customer_count FROM merchant_distance_active_customers ORDER BY merchant_id, distance_bucket LIMIT 50; - 执行结果
![image]()
![image]()
其中第一列是商家ID,第二列是距离x,第三列是数目
- Hive QL代码
- 商家正样本比例统计
- Hive QL代码
USE default; DROP TABLE IF EXISTS merchant_positive_ratio; CREATE TABLE merchant_positive_ratio AS SELECT merchant_id, positive_samples, negative_samples, normal_consumption, (negative_samples + normal_consumption + positive_samples) AS total_samples, CASE WHEN (negative_samples + normal_consumption + positive_samples) > 0 THEN positive_samples / CAST((negative_samples + normal_consumption + positive_samples) AS DOUBLE) ELSE 0.0 END AS positive_ratio FROM onlineconsumption_table; SELECT merchant_id, positive_ratio, positive_samples, total_samples FROM merchant_positive_ratio ORDER BY positive_ratio DESC, positive_samples DESC LIMIT 10; - 执行结果
![image]()
- Hive QL代码
任务四:复杂查询与分析
- 优惠券使用时间统计
- Hive QL代码
USE default; DROP TABLE IF EXISTS coupon_usage_interval_stats; CREATE TABLE coupon_usage_interval_stats AS WITH offline_clean AS ( SELECT CASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id, CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed, CASE WHEN date_received IS NOT NULL AND date_received != 'null' THEN date_received ELSE NULL END AS date_received FROM ccf_offline_stage1_train ), coupon_usage AS ( SELECT coupon_id, DATEDIFF( TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_consumed, 'yyyyMMdd'))), TO_DATE(FROM_UNIXTIME(UNIX_TIMESTAMP(date_received, 'yyyyMMdd'))) ) AS usage_interval FROM offline_clean WHERE coupon_id IS NOT NULL AND date_consumed IS NOT NULL AND date_received IS NOT NULL ), usage_counts AS ( SELECT coupon_id, COUNT(*) AS usage_count, AVG(usage_interval) AS avg_usage_interval FROM coupon_usage GROUP BY coupon_id ), total_usage AS ( SELECT SUM(usage_count) AS total_usage_count FROM usage_counts ) SELECT u.coupon_id, u.usage_count, ROUND(u.avg_usage_interval, 2) AS avg_usage_interval FROM usage_counts u CROSS JOIN total_usage t WHERE u.usage_count > t.total_usage_count * 0.01; SELECT coupon_id, usage_count, avg_usage_interval FROM coupon_usage_interval_stats ORDER BY usage_count DESC, coupon_id LIMIT 100; - 执行结果
![image]()
- Hive QL代码
- 优惠券折扣率统计
- Hive QL代码
USE default; DROP TABLE IF EXISTS coupon_usage_rate_stats; CREATE TABLE coupon_usage_rate_stats AS WITH offline_clean AS ( SELECT CASE WHEN coupon_id IS NOT NULL AND coupon_id != 'null' THEN coupon_id ELSE NULL END AS coupon_id, CASE WHEN `date` IS NOT NULL AND `date` != 'null' THEN `date` ELSE NULL END AS date_consumed, CASE WHEN discount_rate IS NOT NULL AND discount_rate != 'null' THEN discount_rate ELSE NULL END AS discount_rate FROM ccf_offline_stage1_train ), coupon_totals AS ( SELECT coupon_id, MAX(discount_rate) AS discount_rate, COUNT(*) AS total_count, SUM(CASE WHEN date_consumed IS NOT NULL THEN 1 ELSE 0 END) AS used_count FROM offline_clean WHERE coupon_id IS NOT NULL GROUP BY coupon_id ), final_stats AS ( SELECT coupon_id, used_count, total_count, CASE WHEN total_count > 0 THEN used_count / CAST(total_count AS DOUBLE) ELSE 0.0 END AS usage_rate, CASE WHEN discount_rate LIKE '%:%' THEN 'threshold' ELSE 'direct' END AS discount_type, CASE WHEN discount_rate LIKE '%:%' THEN CASE WHEN CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE) > 0 THEN (CAST(SPLIT(discount_rate, ':')[1] AS DOUBLE) / CAST(SPLIT(discount_rate, ':')[0] AS DOUBLE)) * 100 ELSE NULL END ELSE (1 - CAST(discount_rate AS DOUBLE)) * 100 END AS discount_percentage FROM coupon_totals ) SELECT coupon_id, ROUND(usage_rate, 4) AS usage_rate, used_count, total_count, ROUND(discount_percentage, 2) AS discount_percentage_value, discount_type FROM final_stats; SELECT coupon_id, usage_rate, used_count, CONCAT(CAST(discount_percentage_value AS STRING), '%') AS discount_percentage, discount_type FROM coupon_usage_rate_stats ORDER BY usage_rate DESC, used_count DESC, coupon_id LIMIT 10; - 执行结果
![image]()
- Hive QL代码












浙公网安备 33010602011771号