hive_面试题 【排序问题 求视频类网站的top】

1. 数据准备
-- 1. 数据准备
create table gulivideo_ori(
   videoId string comment '视频唯一 id(String)', -- 11 位字符串
   uploader string comment '视频上传者(String)', -- 上传视频的用户名 String
   age int comment '视频年龄(int)', -- 视频在平台上的整数天
   category array<string> comment '视频类别(Array<String>)', -- 上传视频指定的视频分类
   length int comment '视频长度(Int)', -- 整形数字标识的视频长度
   views int comment '观看次数(Int)', -- 视频被浏览的次数
   rate float comment '视频评分(Double)', -- 满分5分
   ratings int comment '流量(Int)', -- 视频的流量,整型数字
   comments int comment '评论数(Int)', -- 一个视频的整数评论数
   relatedId array<string> comment '相关视频 id(Array<String>)' -- 相关视频的 id,最多 20 个 )
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

create table gulivideo_user_ori(
   uploader string comment '上传者用户名', --
   videos int comment '上传视频数',
   friends int comment '上传视频数')
row format delimited
fields terminated by "\t"
stored as textfile;

create table gulivideo_orc(
   videoId string,
   uploader string,
   age int,
   category array<string>,
   length int,
   views int,
   rate float,
   ratings int,
   comments int,
   relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");

create table gulivideo_user_orc(
   uploader string,
   videos int,
   friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");

-- 上传数据到hive
load data local inpath "/root/video" into table gulivideo_ori;
load data local inpath "/root/user" into table gulivideo_user_ori;

-- 将text格式转成orc格式
set yarn.scheduler.maximum-allocation-mb=118784;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
set yarn.nodemanager.vmem-pmem-ratio=4.2;

insert overwrite table gulivideo_orc select * from gulivideo_ori;

insert overwrite table gulivideo_user_orc select * from gulivideo_user_ori;
View Code
2. 测试数据集连接:
链接: https://pan.baidu.com/s/1wa5fdr7O-R9XXjJun64yuQ 提取码: b59i
问题1: 统计视频观看数 Top10
-- 查询sql(方式1)
select
       videoid
       ,views
from gulivideo_orc
order by views desc
limit 10
;
-- 查询结果
videoid views
dMH0bHeiRNg     42513417
0XxI-hvPRRA     20282464
1dmVU08zVpA     16087899
RB-wUgnyGv0     15712924
QjA5faZF1A8     15256922
-_CSo1gOd48     13199833
49IDp76kjPw     11970018
tYnn51C3X_w     11823701
pv5zWaTEVkI     11672017
D2kJZOfq7zk     11184051

-- 查询sql(方式2)
select
t1.*
from (
select
       videoid
       ,views
       ,row_number() over(order by views desc) as rank
from gulivideo_orc
) as t1
where t1.rank <= 10
;

-- 查询结果
dMH0bHeiRNg     42513417        1
0XxI-hvPRRA     20282464        2
1dmVU08zVpA     16087899        3
RB-wUgnyGv0     15712924        4
QjA5faZF1A8     15256922        5
-_CSo1gOd48     13199833        6
49IDp76kjPw     11970018        7
tYnn51C3X_w     11823701        8
pv5zWaTEVkI     11672017        9
D2kJZOfq7zk     11184051        10
问题2 : 统计视频类别热度 Top10
说明 : 查询各个类别的视频量,取前10
注意 : 视频分类(数组存储,一个视频可能会属于多个类别)
-- 查询sql(方式1)
select
t1.*
from (
    select
            category_single
           ,count(distinct videoid) as cnt
    from gulivideo_orc
    lateral view explode(category) t2 as category_single
    group by category_single
) as t1
order by cnt desc
limit 10
;

-- 查询结果
category_single      cnt
Music                179049
Entertainment        127674
Comedy               87818
Animation            73293
Film                 73293
Sports               67329
Games                59817
Gadgets              59817
People               48890
Blogs                48890

Time taken: 31.571 seconds, Fetched: 10 row(s)
问题3 : 统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数
说明 : Stage1 去播放量top20,top
-- 查询sql(方式1)
with t1 as (
    -- top20 视频
    select videoid
         , category
         , views
    from gulivideo_orc
    order by views desc
    limit 20
)
select
category_single
,count(distinct videoid) as cnt
from t1
lateral view explode(category) tmp as category_single
group by category_single
;

category_single cnt
Blogs           2
Comedy          6
Entertainment   6
Music           5
People          2
UNA             1

Time taken: 27.308 seconds, Fetched: 6 row(s)
问题4 : 统计视频观看数 Top50 所关联视频的所属类别排序 ???
-- 查询sql(方式1)
with t1 as (
    -- top20 视频
    select videoid
         , category
         , views
    from gulivideo_orc
    order by views desc
    limit 50
)
select
category_single
,count(distinct videoid) as cnt
from t1
lateral view explode(category) tmp as category_single
group by category_single
;


SELECT
  t6.category_name,
  t6.video_sum,
  rank() over(ORDER BY t6.video_sum DESC ) rk
FROM ( SELECT
  t5.category_name,
  COUNT(t5.relatedid_id) video_sum
FROM
( SELECT
  t4.relatedid_id,
  category_name
FROM
( SELECT
t2.relatedid_id ,
  t3.category
FROM
( SELECT
  relatedid_id
FROM
( SELECT
  videoId,
  views,
  relatedid
FROM
  gulivideo_orc
ORDER BY
  views
DESC
LIMIT 50
)t1
lateral VIEW explode(t1.relatedid) t1_tmp AS relatedid_id
)t2
JOIN
  gulivideo_orc t3
ON
 t2.relatedid_id = t3.videoId
) t4
lateral VIEW explode(t4.category) t4_tmp AS category_name
) t5
GROUP BY
  t5.category_name
ORDER BY
  video_sum
DESC ) t6
View Code
问题5 : 统计每个类别中的视频热度 Top10,以 Music 为例
-- 查询sql(方式1)
select
t1.*
from (
    select
            category_single
           ,videoid
           ,views
           ,row_number() over (partition by category_single order by views desc ) as rank
    from gulivideo_orc
    lateral view explode(category) t2 as category_single
) as t1
where rank <= 10
and category_single = 'Music'
;

-- 查询结果
category_single      videoid         views        rank
Music                QjA5faZF1A8     15256922        1
Music                tYnn51C3X_w     11823701        2
Music                pv5zWaTEVkI     11672017        3
Music                8bbTtPL1jRs     9579911         4
Music                UMf40daefsI     7533070         5
Music                -xEzGIuY7kw     6946033         6
Music                d6C0bNDqf3Y     6935578         7
Music                HSoVKUVOnfQ     6193057         8
Music                3URfWTEPmtE     5581171         9
Music                thtmaZnxk_0     5142238        10
Time taken: 17.235 seconds, Fetched: 10 row(s)
问题6 : 统计每个类别视频观看数 Top10
-- 执行sql
select
t1.*
from (
    select
            category_single
           ,videoid
           ,views
           ,row_number() over (partition by category_single order by views desc ) as rank
    from gulivideo_orc
    lateral view explode(category) t2 as category_single
) as t1
where rank <= 10;
问题7 : 统计上传视频最多的用户Top10以及他们上传的视频 观看次数在前 20 的视频
-- 操作sql
select
       t1.*
from (
select
       uploader
     , videoid
     , views
from gulivideo_orc
where uploader in (
    -- 上传视频数 top10 用户
    select
    uploader
    from  gulivideo_user_orc
    order by videos desc
    limit 10)
) as t1
order by views desc
limit 20
;

 



posted @ 2022-02-15 11:38  学而不思则罔!  阅读(138)  评论(0)    收藏  举报