hive_面试题 【连续登入问题】

思考1: 怎样判断 有序序列 连续?(获取前置元素差值法、获取序号差值法)

 方法1 :(获取前置元素差值法)
      1. 获取当前元素的上一个元素(没有时,为当前元素本身)
            3 3

            5 3
            6 5
            7 6

            9 7
            10 9
            11 10
      2. 当前元素与前一元素求差(差值为1的表示:队中或队尾 差值!=1表示:队首)
            3 3 0

            5 3 2
            6 5 1
            7 6 1

            9 7 2
            10 9 1
            11 10 1
      3.  状态值 : 队首用1 表示,队尾用 0 表示
            3 3 0 1

            5 3 2 1
            6 5 1 0
            7 6 1 0

            9 7 2 1
            10 9 1 0
            11 10 1 0
     4. 对状态值进行累加(累加值相同的表示连续序列)
            3 3 0 1 1

            5 3 2 1 2
            6 5 1 0 2
            7 6 1 0 2

            9 7 2 1 3
            10 9 1 0 3
            11 10 1 0 3  
方法2 :(获取序号差值法)
1. 对序列进行排序,获取序号 3 1 5 2 6 3 7 4 9 5 10 6 11 7 2. 当前元素与序号做差(差值相同的表示为同一连续序列) 3 1 2 5 2 3 6 3 3 7 4 3 9 5 4 10 6 4 11 7 4

需求1 : 求支付宝-蚂蚁森林 找出连续3天及以上减少碳排放量在100以上的用户

-- 数据准备
-- DDL
create table test1 (
`id` string comment '用户id',
`occur_date` string comment '发生日期',
`lowcarbon` string comment '减少碳排放量')
 comment '减少碳排放量转换表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;

-- 数据说明
-- 支付宝-蚂蚁森林 记录运动量转换成 减少碳排放量
-- 每点击一次 会收获一次 减少碳排放量,并生成一条记录
-- 每日每人可多次点击,生成多条记录

-- DML
insert overwrite table test1
select '1001' as id,'2021-12-12' as occur_date,'123' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'123' as lowcarbon union all
select '1002' as id,'2021-12-12' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'43' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'23' as lowcarbon union all
select '1002' as id,'2021-12-14' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-14' as occur_date,'230' as lowcarbon union all
select '1002' as id,'2021-12-15' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-15' as occur_date,'123' as lowcarbon union all
select '1003' as id,'2021-12-15' as occur_date,'123' as lowcarbon union all
select '1003' as id,'2021-12-16' as occur_date,'125' as lowcarbon union all
select '1003' as id,'2021-12-17' as occur_date,'126' as lowcarbon union all
select '1003' as id,'2021-12-18' as occur_date,'127' as lowcarbon union all
select '1003' as id,'2021-12-29' as occur_date,'127' as lowcarbon union all
select '1003' as id,'2021-12-19' as occur_date,'128' as lowcarbon ;
思路1(获取前置元素差值法) :
with t1 as (
    select
    id
    ,occur_date
    ,sum(lowcarbon) as lowcarbon

    from test1
    group by id
    ,occur_date
    having sum(lowcarbon) >= 3
),
t2 as (
select
    id
    ,occur_date
    ,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc) as lag_occur_date
    ,datediff(occur_date
        ,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc)
        ) as diff
    ,if(datediff(occur_date
        ,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc)
        ) = 1,0,1) as contin_flag
from t1),
t3 as (
select
    id
    ,occur_date
    ,sum(contin_flag) over(partition by id order by occur_date asc) as contin_group
from t2)
select
    id
    ,contin_group
    ,count(occur_date) as contin_days
from t3
group by     id
    ,contin_group
having count(occur_date)  >= 3
;

id      contin_group    contin_days
1001    2       4
1001    3       4
1003    1       5
Time taken: 39.01 seconds, Fetched: 3 row(s)
View Code
思路2(获取序号差值法) :
with t1 as (
    select
    id
    ,occur_date
    ,sum(lowcarbon) as lowcarbon

    from test1
    group by id
    ,occur_date
    having sum(lowcarbon) >= 3
),
t2 as (
    select id
         , occur_date
         , row_number() over (partition by id order by occur_date asc) as rank
         , date_sub(
            occur_date
        , row_number() over (partition by id order by occur_date asc)
        )                                                             as diff_day
    from t1
)
select
id
,diff_day
,count(occur_date) as cont_days
from t2
group by id
,diff_day
having count(occur_date) >=3
;

id      diff_day        cont_days
1001    2021-12-10      4
1001    2021-12-14      4
1003    2021-12-14      5
Time taken: 23.384 seconds, Fetched: 3 row(s)
View Code

需求2 : 连续x天登录的用户( 找出连续7天登陆、连续30天登陆的用户、最大连续登陆天数)

-- 数据准备
-- DDL
create table test2 (
`user_id` string comment '用户id',
`login_date` string comment '登入日期',
`login_status` string comment '登入状态(1:成功,2:异常)')
 comment '用户登入记录表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;

-- DML
insert overwrite table test2
select '1001' as id,'2021-12-12' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'0' as lowcarbon union all
select '1002' as id,'2021-12-12' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1002' as id,'2021-12-14' as occur_date,'0' as lowcarbon union all
select '1001' as id,'2021-12-14' as occur_date,'1' as lowcarbon union all
select '1002' as id,'2021-12-15' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-15' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-15' as occur_date,'0' as lowcarbon union all
select '1003' as id,'2021-12-16' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-17' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-18' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-29' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-02' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-03' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-04' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-05' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-06' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-19' as occur_date,'1' as lowcarbon ;

-- 数据说明
-- 每天每天可能有多次登入记录
思路1 :
-- 需求 : 求每个人的最大连续登入天数,并判断是否有连续2天登入的记录
-- 获取序号 差值法
with t1 as (
    select
    user_id
    ,login_date
    from test2
    where login_status = '1'
    group by user_id
    ,login_date
)
select
user_id
,max(days) as max_login_days -- 最大登入天数
,max(case when days = 2 then 1
 else 0 end ) as if_2login_days -- 是否2天连续登入

from (
     select user_id
          , diff_days
          , count(distinct login_date) as days -- 连续登入天数
     from (
              select user_id
                   , login_date
                   , row_number() over (partition by user_id order by login_date asc)                       as rank
                   , date_sub(login_date,
                              row_number() over (partition by user_id order by login_date asc))             as diff_days
              from t1
          ) as t2
     group by user_id
            , diff_days
) as t3
group by user_id
;

user_id max_login_days  if_2login_days
1001    4       1
1002    1       0
1003    4       0
View Code

 



posted @ 2022-02-15 17:10  学而不思则罔!  阅读(286)  评论(0)    收藏  举报