hive_面试题 【排序问题 求视频类网站的top】
1. 数据准备

-- 1. 数据准备 create table gulivideo_ori( videoId string comment '视频唯一 id(String)', -- 11 位字符串 uploader string comment '视频上传者(String)', -- 上传视频的用户名 String age int comment '视频年龄(int)', -- 视频在平台上的整数天 category array<string> comment '视频类别(Array<String>)', -- 上传视频指定的视频分类 length int comment '视频长度(Int)', -- 整形数字标识的视频长度 views int comment '观看次数(Int)', -- 视频被浏览的次数 rate float comment '视频评分(Double)', -- 满分5分 ratings int comment '流量(Int)', -- 视频的流量,整型数字 comments int comment '评论数(Int)', -- 一个视频的整数评论数 relatedId array<string> comment '相关视频 id(Array<String>)' -- 相关视频的 id,最多 20 个 ) row format delimited fields terminated by "\t" collection items terminated by "&" stored as textfile; create table gulivideo_user_ori( uploader string comment '上传者用户名', -- videos int comment '上传视频数', friends int comment '上传视频数') row format delimited fields terminated by "\t" stored as textfile; create table gulivideo_orc( videoId string, uploader string, age int, category array<string>, length int, views int, rate float, ratings int, comments int, relatedId array<string>) stored as orc tblproperties("orc.compress"="SNAPPY"); create table gulivideo_user_orc( uploader string, videos int, friends int) row format delimited fields terminated by "\t" stored as orc tblproperties("orc.compress"="SNAPPY"); -- 上传数据到hive load data local inpath "/root/video" into table gulivideo_ori; load data local inpath "/root/user" into table gulivideo_user_ori; -- 将text格式转成orc格式 set yarn.scheduler.maximum-allocation-mb=118784; set mapreduce.map.memory.mb=4096; set mapreduce.reduce.memory.mb=4096; set yarn.nodemanager.vmem-pmem-ratio=4.2; insert overwrite table gulivideo_orc select * from gulivideo_ori; insert overwrite table gulivideo_user_orc select * from gulivideo_user_ori;
2. 测试数据集连接:
链接: https://pan.baidu.com/s/1wa5fdr7O-R9XXjJun64yuQ 提取码: b59i
问题1: 统计视频观看数 Top10
-- 查询sql(方式1) select videoid ,views from gulivideo_orc order by views desc limit 10 ; -- 查询结果 videoid views dMH0bHeiRNg 42513417 0XxI-hvPRRA 20282464 1dmVU08zVpA 16087899 RB-wUgnyGv0 15712924 QjA5faZF1A8 15256922 -_CSo1gOd48 13199833 49IDp76kjPw 11970018 tYnn51C3X_w 11823701 pv5zWaTEVkI 11672017 D2kJZOfq7zk 11184051 -- 查询sql(方式2) select t1.* from ( select videoid ,views ,row_number() over(order by views desc) as rank from gulivideo_orc ) as t1 where t1.rank <= 10 ; -- 查询结果 dMH0bHeiRNg 42513417 1 0XxI-hvPRRA 20282464 2 1dmVU08zVpA 16087899 3 RB-wUgnyGv0 15712924 4 QjA5faZF1A8 15256922 5 -_CSo1gOd48 13199833 6 49IDp76kjPw 11970018 7 tYnn51C3X_w 11823701 8 pv5zWaTEVkI 11672017 9 D2kJZOfq7zk 11184051 10
问题2 : 统计视频类别热度 Top10
说明 : 查询各个类别的视频量,取前10
注意 : 视频分类(数组存储,一个视频可能会属于多个类别)
-- 查询sql(方式1) select t1.* from ( select category_single ,count(distinct videoid) as cnt from gulivideo_orc lateral view explode(category) t2 as category_single group by category_single ) as t1 order by cnt desc limit 10 ; -- 查询结果 category_single cnt Music 179049 Entertainment 127674 Comedy 87818 Animation 73293 Film 73293 Sports 67329 Games 59817 Gadgets 59817 People 48890 Blogs 48890 Time taken: 31.571 seconds, Fetched: 10 row(s)
问题3 : 统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数
说明 : Stage1 去播放量top20,top
-- 查询sql(方式1) with t1 as ( -- top20 视频 select videoid , category , views from gulivideo_orc order by views desc limit 20 ) select category_single ,count(distinct videoid) as cnt from t1 lateral view explode(category) tmp as category_single group by category_single ; category_single cnt Blogs 2 Comedy 6 Entertainment 6 Music 5 People 2 UNA 1 Time taken: 27.308 seconds, Fetched: 6 row(s)
问题4 : 统计视频观看数 Top50 所关联视频的所属类别排序 ???

-- 查询sql(方式1) with t1 as ( -- top20 视频 select videoid , category , views from gulivideo_orc order by views desc limit 50 ) select category_single ,count(distinct videoid) as cnt from t1 lateral view explode(category) tmp as category_single group by category_single ; SELECT t6.category_name, t6.video_sum, rank() over(ORDER BY t6.video_sum DESC ) rk FROM ( SELECT t5.category_name, COUNT(t5.relatedid_id) video_sum FROM ( SELECT t4.relatedid_id, category_name FROM ( SELECT t2.relatedid_id , t3.category FROM ( SELECT relatedid_id FROM ( SELECT videoId, views, relatedid FROM gulivideo_orc ORDER BY views DESC LIMIT 50 )t1 lateral VIEW explode(t1.relatedid) t1_tmp AS relatedid_id )t2 JOIN gulivideo_orc t3 ON t2.relatedid_id = t3.videoId ) t4 lateral VIEW explode(t4.category) t4_tmp AS category_name ) t5 GROUP BY t5.category_name ORDER BY video_sum DESC ) t6
问题5 : 统计每个类别中的视频热度 Top10,以 Music 为例
-- 查询sql(方式1) select t1.* from ( select category_single ,videoid ,views ,row_number() over (partition by category_single order by views desc ) as rank from gulivideo_orc lateral view explode(category) t2 as category_single ) as t1 where rank <= 10 and category_single = 'Music' ; -- 查询结果 category_single videoid views rank Music QjA5faZF1A8 15256922 1 Music tYnn51C3X_w 11823701 2 Music pv5zWaTEVkI 11672017 3 Music 8bbTtPL1jRs 9579911 4 Music UMf40daefsI 7533070 5 Music -xEzGIuY7kw 6946033 6 Music d6C0bNDqf3Y 6935578 7 Music HSoVKUVOnfQ 6193057 8 Music 3URfWTEPmtE 5581171 9 Music thtmaZnxk_0 5142238 10 Time taken: 17.235 seconds, Fetched: 10 row(s)
问题6 : 统计每个类别视频观看数 Top10
-- 执行sql select t1.* from ( select category_single ,videoid ,views ,row_number() over (partition by category_single order by views desc ) as rank from gulivideo_orc lateral view explode(category) t2 as category_single ) as t1 where rank <= 10;
问题7 : 统计上传视频最多的用户Top10以及他们上传的视频 观看次数在前 20 的视频
-- 操作sql select t1.* from ( select uploader , videoid , views from gulivideo_orc where uploader in ( -- 上传视频数 top10 用户 select uploader from gulivideo_user_orc order by videos desc limit 10) ) as t1 order by views desc limit 20 ;