hive_面试题 【累计求和-分组问题】

需求1 : 某个用户连续的访问记录如果时间间隔小于 60 秒,则分为同一个组
(同一个组内的数据,访问时间间隔小于60s)

1. 数据准备
-- DDL
create table test3 (
`id` string comment '用户id',
`showtime` string comment '访问时间')
 comment '用户访问记录表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;

-- 插入数据
insert overwrite table test3
select '1001' as id,'17523641234' as showtime union all
select '1001' as id,'17523641256' as showtime union all
select '1002' as id,'17523641278' as showtime union all
select '1001' as id,'17523641334' as showtime union all
select '1002' as id,'17523641434' as showtime union all
select '1001' as id,'17523641534' as showtime union all
select '1001' as id,'17523641544' as showtime union all
select '1002' as id,'17523641634' as showtime union all
select '1001' as id,'17523641638' as showtime union all
select '1001' as id,'17523641654' as showtime ;

-- 数据说明
-- 电商公司用户访问时间数据,每访问一次则会产生一条记录
2. 思路 
with t1 as (
select
id
,showtime
,lag(showtime,1,showtime) over(partition by id order by showtime asc) as pre_showtime -- 当前行的前一行的showtime
,cast(showtime - lag(showtime,1,showtime) over(partition by id order by showtime asc) as int)  as time_diff
from test3)
select
id
,showtime
,time_diff
,if(time_diff>=60,1,0) as groupby_flag
,sum(if(time_diff>=60,1,0)) over(partition by id order by showtime asc) groupid
from t1
;

id      showtime        time_diff       groupby_flag    groupid
1001    17523641234     0               0               0
1001    17523641256     22              0               0
1001    17523641334     78              1               1
1001    17523641534     200             1               2
1001    17523641544     10              0               2
1001    17523641638     94              1               3
1001    17523641654     16              0               3
1002    17523641278     0               0               0
1002    17523641434     156             1               1
1002    17523641634     200             1               2

需求2 : 求连续点击三次的用户数,中间不能有别人的点击 (最大连续天数的变形问题) 腾讯面试

1. 数据准备

-- DDL
create table atab (
`usr_id` string comment '用户id',
`click_time` bigint comment '点击事件')
 comment '组件A点击流水信息表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;

-- DML
insert overwrite table atab
select 'a' as usr_id,'175236412341' as click_time union all
select 'a' as usr_id,'175236412351' as click_time union all
select 'b' as usr_id,'175236412361' as click_time union all
select 'a' as usr_id,'175236412371' as click_time union all
select 'a' as usr_id,'175236412381' as click_time union all
select 'a' as usr_id,'175236412391' as click_time union all
select 'a' as usr_id,'1752364123101' as click_time union all
select 'c' as usr_id,'1752364123111' as click_time union all
select 'c' as usr_id,'1752364123121' as click_time
;

-- 数据说明
-- A表记录了点击的流水信息,包括用户id,和点击时间

2.思路分析1(获取前置元素 累计求和分组法)

-- 分析
usr_id  click_time
a       175236412341    a   0    0
a       175236412351    a   0    0
b       175236412361    a   1    1
a       175236412371    b   1    2
a       175236412381    a   0    2
a       175236412391    a   0    2
a       1752364123101   a   0    2
c       1752364123111   a   1    3
c       1752364123121   c   0    3

需求 : 求连续点击三次的用户数,中间不能有别人的点击

-- 获取前置元素 累计求和分组法
-- 1. 对 click_time 正向排序
-- 2. 获取前一行的usr_id(当前行) 名为 pre_usr_id
-- 3. 判断 pre_usr_id 和 usr_id 是否相同,相同标记为0,不同标记为0
-- 4. 对 上述标记 累计求和
-- 5. 对 求和 进行分组

3. 实现1sql

select
     usr_id
    ,groupid
    ,count(1) as cont_cnt
from (
    -- 累计求和,获取 groupid
    select
    usr_id
    ,click_time
    ,sum(if_equal) over (order by click_time asc) as groupid
    from (
        select
            usr_id
            ,click_time
             -- 全表 对click_time排序
            , lag(usr_id,1,usr_id) over(order by click_time asc) as pre_usr_id
            -- 判断 pre_usr_id 和 usr_id 是否相同,相同标记为0,不同标记为0
            ,if(usr_id = lag(usr_id,1,usr_id) over(order by click_time asc),0,1) as if_equal
        from atab
    ) as t1
) as t2
group by usr_id
    ,groupid
having count(1) >= 3
;

-- 查询结果
usr_id  groupid cont_cnt
a       2       4
Time taken: 38.347 seconds, Fetched: 1 row(s)

 2. 思路分析2(双重排序差值法)

-- 分析
-- 双重排序差值法
-- 1. 全局对 click_time 排序
-- 2. 分组usr_id 对click_time 排序
-- 3. 对 rank1 和rank2 求差值
-- 4. 对差值分组求和

usr_id  click_time      rank_1  rank_2
a       175236412341    1       1   0
a       175236412351    2       2   0
a       175236412371    4       3   1
a       175236412381    5       4   1
a       175236412391    6       5   1
a       1752364123101   7       6   1
b       175236412361    3       1   2
c       1752364123111   8       1   7
c       1752364123121   9       2   7

3. 实现2sql

select
       diff
     ,usr_id
     ,count(diff)
from
(
   select *, rank_1 - rank_2  as diff
   from (
      select usr_id
            ,click_time
            ,row_number() over(order by click_time) as rank_1
            ,row_number() over(partition by usr_id order by click_time) as rank_2
      from atab
   ) t1
) t2
group by diff,usr_id
having count(diff) >=3
;

-- 查询结果
diff    usr_id  _c2
1       a       4
Time taken: 37.683 seconds, Fetched: 1 row(s)

 














 

posted @ 2022-02-16 19:02  学而不思则罔!  阅读(870)  评论(0)    收藏  举报