hive编程面试题1

第1题

表结构：uid,subject_id,score

求：找出所有科目成绩都大于某一学科平均成绩的学生

数据集如下

1001    01  90
1001    02  90
1001    03  90
1002    01  85
1002    02  85
1002    03  70
1003    01  70
1003    02  70
1003    03  85

1）建表语句

create table score(

  uid string,

  subject_id string,

  score int)

row format delimited fields terminated by '\t';

2）求出每个学科平均成绩

select

  uid,

  score,

  avg(score) over(partition by subject_id) avg_score

from

  score;t1

3）根据是否大于平均成绩记录flag，大于则记为0否则记为1

select

  uid,

  if(score>avg_score,0,1) flag

from

  t1;t2

4）根据学生id进行分组统计flag的和，和为0则是所有学科都大于平均成绩

select

  uid

from

  t2

group by

  uid

having

  sum(flag)=0;

5）最终SQL

select

  uid

from

  (select

  uid,

  if(score>avg_score,0,1) flag

from

  (select

  uid,

  score,
  avg(score) over(partition by subject_id) avg_score
from

  score)t1)t2

group by

  uid

having

  sum(flag)=0;

第2题

我们有如下的用户访问数据

userId	visitDate	visitCount
u01	2017/1/21	5
u02	2017/1/23	6
u03	2017/1/22	8
u04	2017/1/20	3
u01	2017/1/23	6
u01	2017/2/21	8
U02	2017/1/23	6
U01	2017/2/22	4

要求使用SQL统计出每个用户的累积访问次数，如下表所示：

用户id	月份	小计	累积
u01	2017-01	11	11
u01	2017-02	12	23
u02	2017-01	12	12
u03	2017-01	8	8
u04	2017-01	3	3

数据集

u01   2017/1/21    5
u02   2017/1/23    6
u03   2017/1/22    8
u04   2017/1/20    3
u01   2017/1/23    6
u01   2017/2/21    8
u02   2017/1/23    6
u01   2017/2/22    4

1）创建表

create table action
(userId string,
visitDate string,
visitCount int) 
row format delimited fields terminated by "\t";

2）修改数据格式

select

   userId,

   date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
   visitCount
from

   action;t1

3）计算每人单月访问量

select

  userId,

  mn,

  sum(visitCount) mn_count

from

  t1

group by 

userId,mn;t2

4）按月累计访问量

select

  userId,

  mn,

  mn_count,

  sum(mn_count) over(partition by userId order by mn)

from t2;

5）最终SQL

select

  userId,

  mn,

  mn_count,

  sum(mn_count) over(partition by userId order by mn)

from 

(  select

   userId,

    mn,

    sum(visitCount) mn_count

  from

     (select

       userId,

       date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,

       visitCount

     from

       action)t1

group by userId,mn)t2;

第3题

有50W个京东店铺，每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志，访问日志存储的表名为Visit，访客的用户id为user_id，被访问的店铺名称为shop，请统计：

1）每个店铺的UV（访客数）

2）每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数

数据集

u1  a
u2  b
u1  b
u1  a
u3  c
u4  b
u1  a
u2  c
u5  b
u4  b
u6  c
u2  c
u1  b
u2  a
u2  a
u3  a
u5  a
u5  a
u5  a

1）建表

create table visit(user_id string,shop string) row format delimited fields terminated by '\t';

2）每个店铺的UV（访客数）

select shop,count(distinct user_id) from visit group by shop;

3）每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数

（1）查询每个店铺被每个用户访问次数

select shop,user_id,count(*) ct

from visit

group by shop,user_id;t1

（2）计算每个店铺被用户访问次数排名

select shop,user_id,ct,rank() over(partition by shop order by ct) rk

from t1;t2

（3）取每个店铺排名前3的

select shop,user_id,ct

from t2

where rk<=3;

（4）最终SQL

select 

shop,

user_id,

ct

from

(select 

shop,

user_id,

ct,

rank() over(partition by shop order by ct) rk

from 

(select 

shop,

user_id,

count(*) ct

from visit

group by 

shop,

user_id)t1

)t2

where rk<=3;

第4题

已知一个表STG.ORDER，有如下字段:Date，Order_id，User_id，amount。请给出sql进行统计:数据样例:

2017-01-01,10029028,1000003251,33.57。

1）给出 2017年每个月的订单数、用户数、总成交金额。

2）给出2017年11月的新客数(指在11月才有第一笔订单)

建表

create table order_tab(dt string,order_id string,user_id string,amount decimal(10,2)) row format delimited fields terminated by '\t';

1）给出 2017年每个月的订单数、用户数、总成交金额。

select

  date_format(dt,'yyyy-MM'),

  count(order_id),

  count(distinct user_id),

  sum(amount)

from

  order_tab

where

  date_format(dt,'yyyy')='2017'

group by

  date_format(dt,'yyyy-MM');

2）给出2017年11月的新客数(指在11月才有第一笔订单)

select

  count(user_id)

from

  order_tab

group by

  user_id

having

  date_format(min(dt),'yyyy-MM')='2017-11';

第5题

有日志如下，请写出代码求得所有用户和活跃用户的总数及平均年龄。（活跃用户指连续两天都有访问记录的用户）日期用户年龄

数据集

2019-02-11,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19

1）建表

create table user_age(dt string,user_id string,age int)row format delimited fields terminated by ',';

2）按照日期以及用户分组，按照日期排序并给出排名

select

  dt,

  user_id,

  min(age) age,

  rank() over(partition by user_id order by dt) rk

from

  user_age

group by

  dt,user_id;t1

3）计算日期及排名的差值

select

  user_id,

  age,

  date_sub(dt,rk) flag

from

  t1;t2

4）过滤出差值大于等于2的，即为连续两天活跃的用户

select

  user_id,

  min(age) age

from

  t2

group by

  user_id,flag

having

  count(*)>=2;t3

5）对数据进行去重处理（一个用户可以在两个不同的时间点连续登录），例如：a用户在1月10号1月11号以及1月20号和1月21号4天登录。

select

  user_id,

  min(age) age

from

  t3

group by

  user_id;t4

6）计算活跃用户（两天连续有访问）的人数以及平均年龄

select

  count(*) ct,

  cast(sum(age)/count(*) as decimal(10,2))

from t4;

7）对全量数据集进行按照用户去重

select

  user_id,

  min(age) age 

from

  user_age 

group by 

  user_id;t5

8）计算所有用户的数量以及平均年龄

select

  count(*) user_count,

  cast((sum(age)/count(*)) as decimal(10,1)) 

from 

  t5;

9）将第5步以及第7步两个数据集进行union all操作

select

  0 user_total_count,

  0 user_total_avg_age,

  count(*) twice_count,

  cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age

from 

(

  select

  user_id,

  min(age) age

from

  (select

  user_id,

  min(age) age

from

  (

  select

  user_id,

  age,

  date_sub(dt,rk) flag

from

  (

  select

•    dt,

•    user_id,

•    min(age) age,

•    rank() over(partition by user_id order by dt) rk

  from

•    user_age

  group by

•    dt,user_id

  )t1

  )t2

group by

  user_id,flag

having

  count(*)>=2)t3

group by

  user_id 

)t4

 

union all

 

select

  count(*) user_total_count,

  cast((sum(age)/count(*)) as decimal(10,1)),

  0 twice_count,

  0 twice_count_avg_age

from 

  (

   select

•     user_id,

•     min(age) age 

   from 

•     user_age 

   group by 

•     user_id

  )t5;t6

10）求和并拼接为最终SQL

select 

  sum(user_total_count),

  sum(user_total_avg_age),

  sum(twice_count),

  sum(twice_count_avg_age)

from 

(select

  0 user_total_count,

  0 user_total_avg_age,

  count(*) twice_count,

  cast(sum(age)/count(*) as decimal(10,2)) twice_count_avg_age

from 

(

  select

  user_id,

  min(age) age

from

  (select

  user_id,

  min(age) age

from

  (

  select

  user_id,

  age,

  date_sub(dt,rk) flag

from

  (

  select

•    dt,

•    user_id,

•    min(age) age,

•    rank() over(partition by user_id order by dt) rk

  from

•    user_age

  group by

•    dt,user_id

  )t1

  )t2

group by

  user_id,flag

having

  count(*)>=2)t3

group by

  user_id 

)t4

 

union all

 

select

  count(*) user_total_count,

  cast((sum(age)/count(*)) as decimal(10,1)),

  0 twice_count,

  0 twice_count_avg_age

from 

  (

   select

•     user_id,

•     min(age) age 

   from 

•     user_age 

   group by 

•     user_id

  )t5)t6;

posted @ 2021-10-08 16:22 碧水斜茶阅读(143) 评论(0) 收藏举报

刷新页面返回顶部

hive编程面试题1

第1题

第2题

第3题

第4题

第5题

公告