关于hive日志数据分析相关操作

昨天项目——日志数据分析

-- 用于清洗的表
create table data1
(
    `ip`      string comment '城市',
    `date1`   string comment '日期',
    `day`     string comment '天数',
    `traffic` double comment '流量',
    `type`    string comment '类型:视频video或文章article',
    `id`      string comment '视频或者文章的id'
)
    row format delimited
        fields terminated by ','
        lines terminated by '\n';
-- 上传
load data inpath '/hivedata/result.txt' into table data1;
-- 测试
select *
from data1
limit 100;
-- 删除表(测试)
drop table data1;
-- 清洗
insert overwrite table data1
select ip,
       date_format(from_unixtime(unix_timestamp(date1, 'dd/MMM/yyyy:HH:mm:ss Z'), 'yyyy-MM-dd HH:mm:ss'),
                   'yyyy-MM-dd HH:mm:ss') as date1,
       day,
       traffic,
       type,
       id
from data1;
-- 为视频(video)和文章(article)分别统计访问次数,然后合并结果并按访问次数降序排列,选择前10个记录
CREATE TABLE top_visits AS
SELECT type, id, COUNT(*) AS visit_count
FROM data1
GROUP BY type, id
ORDER BY visit_count DESC
LIMIT 10;

select *
from top_visits;
-- 最受欢迎ip
create table top_ip AS
SELECT ip, count(*) AS ip_count
FROM data1
group by ip
ORDER BY ip_count DESC
LIMIT 10;

select *
from top_ip;

-- 流量
create table top_traffic AS
SELECT type, id, count(*) as traffic_count
FROM data1
group by type, id
ORDER BY traffic_count DESC
LIMIT 10;

select *
from top_traffic;

 

posted @ 2023-10-13 21:24  花伤错零  阅读(33)  评论(0)    收藏  举报