关于hive日志数据分析相关操作
昨天项目——日志数据分析
-- 用于清洗的表 create table data1 ( `ip` string comment '城市', `date1` string comment '日期', `day` string comment '天数', `traffic` double comment '流量', `type` string comment '类型:视频video或文章article', `id` string comment '视频或者文章的id' ) row format delimited fields terminated by ',' lines terminated by '\n'; -- 上传 load data inpath '/hivedata/result.txt' into table data1; -- 测试 select * from data1 limit 100; -- 删除表(测试) drop table data1; -- 清洗 insert overwrite table data1 select ip, date_format(from_unixtime(unix_timestamp(date1, 'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd HH:mm:ss') as date1, day, traffic, type, id from data1; -- 为视频(video)和文章(article)分别统计访问次数,然后合并结果并按访问次数降序排列,选择前10个记录 CREATE TABLE top_visits AS SELECT type, id, COUNT(*) AS visit_count FROM data1 GROUP BY type, id ORDER BY visit_count DESC LIMIT 10; select * from top_visits; -- 最受欢迎ip create table top_ip AS SELECT ip, count(*) AS ip_count FROM data1 group by ip ORDER BY ip_count DESC LIMIT 10; select * from top_ip; -- 流量 create table top_traffic AS SELECT type, id, count(*) as traffic_count FROM data1 group by type, id ORDER BY traffic_count DESC LIMIT 10; select * from top_traffic;

浙公网安备 33010602011771号