3月7日
日志分析全流程
-- 1. 原始日志清洗 CREATE TABLE cleaned_logs AS SELECT user_id, action, duration FROM raw_logs WHERE duration > 0; -- 2. 按用户分区分桶存储 CREATE TABLE user_stats PARTITIONED BY (dt STRING) CLUSTERED BY (user_id) INTO 32 BUCKETS AS SELECT user_id, AVG(duration) AS avg_duration FROM cleaned_logs GROUP BY user_id; -- 3. 输出报表 INSERT OVERWRITE LOCAL DIRECTORY '/report' SELECT dt, user_id, avg_duration FROM user_stats ORDER BY avg_duration DESC;
https://www.bilibili.com/video/BV1W4411B7cB

浙公网安备 33010602011771号