思考1: 怎样判断 有序序列 连续?(获取前置元素差值法、获取序号差值法)
方法1 :(获取前置元素差值法)
1. 获取当前元素的上一个元素(没有时,为当前元素本身)
3 3
5 3
6 5
7 6
9 7
10 9
11 10
2. 当前元素与前一元素求差(差值为1的表示:队中或队尾 差值!=1表示:队首)
3 3 0
5 3 2
6 5 1
7 6 1
9 7 2
10 9 1
11 10 1
3. 状态值 : 队首用1 表示,队尾用 0 表示
3 3 0 1
5 3 2 1
6 5 1 0
7 6 1 0
9 7 2 1
10 9 1 0
11 10 1 0
4. 对状态值进行累加(累加值相同的表示连续序列)
3 3 0 1 1
5 3 2 1 2
6 5 1 0 2
7 6 1 0 2
9 7 2 1 3
10 9 1 0 3
11 10 1 0 3
方法2 :(获取序号差值法)
1. 对序列进行排序,获取序号
3 1
5 2
6 3
7 4
9 5
10 6
11 7
2. 当前元素与序号做差(差值相同的表示为同一连续序列)
3 1 2
5 2 3
6 3 3
7 4 3
9 5 4
10 6 4
11 7 4
需求1 : 求支付宝-蚂蚁森林 找出连续3天及以上减少碳排放量在100以上的用户
-- 数据准备
-- DDL
create table test1 (
`id` string comment '用户id',
`occur_date` string comment '发生日期',
`lowcarbon` string comment '减少碳排放量')
comment '减少碳排放量转换表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;
-- 数据说明
-- 支付宝-蚂蚁森林 记录运动量转换成 减少碳排放量
-- 每点击一次 会收获一次 减少碳排放量,并生成一条记录
-- 每日每人可多次点击,生成多条记录
-- DML
insert overwrite table test1
select '1001' as id,'2021-12-12' as occur_date,'123' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'123' as lowcarbon union all
select '1002' as id,'2021-12-12' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'43' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'23' as lowcarbon union all
select '1002' as id,'2021-12-14' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-14' as occur_date,'230' as lowcarbon union all
select '1002' as id,'2021-12-15' as occur_date,'45' as lowcarbon union all
select '1001' as id,'2021-12-15' as occur_date,'123' as lowcarbon union all
select '1003' as id,'2021-12-15' as occur_date,'123' as lowcarbon union all
select '1003' as id,'2021-12-16' as occur_date,'125' as lowcarbon union all
select '1003' as id,'2021-12-17' as occur_date,'126' as lowcarbon union all
select '1003' as id,'2021-12-18' as occur_date,'127' as lowcarbon union all
select '1003' as id,'2021-12-29' as occur_date,'127' as lowcarbon union all
select '1003' as id,'2021-12-19' as occur_date,'128' as lowcarbon ;
思路1(获取前置元素差值法) :

with t1 as (
select
id
,occur_date
,sum(lowcarbon) as lowcarbon
from test1
group by id
,occur_date
having sum(lowcarbon) >= 3
),
t2 as (
select
id
,occur_date
,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc) as lag_occur_date
,datediff(occur_date
,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc)
) as diff
,if(datediff(occur_date
,lag(occur_date,1,occur_date) over(partition by id order by occur_date asc)
) = 1,0,1) as contin_flag
from t1),
t3 as (
select
id
,occur_date
,sum(contin_flag) over(partition by id order by occur_date asc) as contin_group
from t2)
select
id
,contin_group
,count(occur_date) as contin_days
from t3
group by id
,contin_group
having count(occur_date) >= 3
;
id contin_group contin_days
1001 2 4
1001 3 4
1003 1 5
Time taken: 39.01 seconds, Fetched: 3 row(s)
View Code
思路2(获取序号差值法) :

with t1 as (
select
id
,occur_date
,sum(lowcarbon) as lowcarbon
from test1
group by id
,occur_date
having sum(lowcarbon) >= 3
),
t2 as (
select id
, occur_date
, row_number() over (partition by id order by occur_date asc) as rank
, date_sub(
occur_date
, row_number() over (partition by id order by occur_date asc)
) as diff_day
from t1
)
select
id
,diff_day
,count(occur_date) as cont_days
from t2
group by id
,diff_day
having count(occur_date) >=3
;
id diff_day cont_days
1001 2021-12-10 4
1001 2021-12-14 4
1003 2021-12-14 5
Time taken: 23.384 seconds, Fetched: 3 row(s)
View Code
需求2 : 连续x天登录的用户( 找出连续7天登陆、连续30天登陆的用户、最大连续登陆天数)
-- 数据准备
-- DDL
create table test2 (
`user_id` string comment '用户id',
`login_date` string comment '登入日期',
`login_status` string comment '登入状态(1:成功,2:异常)')
comment '用户登入记录表'
row format delimited fields terminated by '\t'
lines terminated by '\n' stored as orc;
-- DML
insert overwrite table test2
select '1001' as id,'2021-12-12' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-20' as occur_date,'0' as lowcarbon union all
select '1002' as id,'2021-12-12' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-13' as occur_date,'1' as lowcarbon union all
select '1002' as id,'2021-12-14' as occur_date,'0' as lowcarbon union all
select '1001' as id,'2021-12-14' as occur_date,'1' as lowcarbon union all
select '1002' as id,'2021-12-15' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2021-12-15' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-15' as occur_date,'0' as lowcarbon union all
select '1003' as id,'2021-12-16' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-17' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-18' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-29' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-01' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-02' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-03' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-04' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-05' as occur_date,'1' as lowcarbon union all
select '1001' as id,'2022-01-06' as occur_date,'1' as lowcarbon union all
select '1003' as id,'2021-12-19' as occur_date,'1' as lowcarbon ;
-- 数据说明
-- 每天每天可能有多次登入记录
思路1 :

-- 需求 : 求每个人的最大连续登入天数,并判断是否有连续2天登入的记录
-- 获取序号 差值法
with t1 as (
select
user_id
,login_date
from test2
where login_status = '1'
group by user_id
,login_date
)
select
user_id
,max(days) as max_login_days -- 最大登入天数
,max(case when days = 2 then 1
else 0 end ) as if_2login_days -- 是否2天连续登入
from (
select user_id
, diff_days
, count(distinct login_date) as days -- 连续登入天数
from (
select user_id
, login_date
, row_number() over (partition by user_id order by login_date asc) as rank
, date_sub(login_date,
row_number() over (partition by user_id order by login_date asc)) as diff_days
from t1
) as t2
group by user_id
, diff_days
) as t3
group by user_id
;
user_id max_login_days if_2login_days
1001 4 1
1002 1 0
1003 4 0
View Code