1. 数据准备

-- 1. 数据准备
create table gulivideo_ori(
videoId string comment '视频唯一 id(String)', -- 11 位字符串
uploader string comment '视频上传者(String)', -- 上传视频的用户名 String
age int comment '视频年龄(int)', -- 视频在平台上的整数天
category array<string> comment '视频类别(Array<String>)', -- 上传视频指定的视频分类
length int comment '视频长度(Int)', -- 整形数字标识的视频长度
views int comment '观看次数(Int)', -- 视频被浏览的次数
rate float comment '视频评分(Double)', -- 满分5分
ratings int comment '流量(Int)', -- 视频的流量,整型数字
comments int comment '评论数(Int)', -- 一个视频的整数评论数
relatedId array<string> comment '相关视频 id(Array<String>)' -- 相关视频的 id,最多 20 个 )
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
create table gulivideo_user_ori(
uploader string comment '上传者用户名', --
videos int comment '上传视频数',
friends int comment '上传视频数')
row format delimited
fields terminated by "\t"
stored as textfile;
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
-- 上传数据到hive
load data local inpath "/root/video" into table gulivideo_ori;
load data local inpath "/root/user" into table gulivideo_user_ori;
-- 将text格式转成orc格式
set yarn.scheduler.maximum-allocation-mb=118784;
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
set yarn.nodemanager.vmem-pmem-ratio=4.2;
insert overwrite table gulivideo_orc select * from gulivideo_ori;
insert overwrite table gulivideo_user_orc select * from gulivideo_user_ori;
View Code
2. 测试数据集连接:
链接: https://pan.baidu.com/s/1wa5fdr7O-R9XXjJun64yuQ 提取码: b59i
问题1: 统计视频观看数 Top10
-- 查询sql(方式1)
select
videoid
,views
from gulivideo_orc
order by views desc
limit 10
;
-- 查询结果
videoid views
dMH0bHeiRNg 42513417
0XxI-hvPRRA 20282464
1dmVU08zVpA 16087899
RB-wUgnyGv0 15712924
QjA5faZF1A8 15256922
-_CSo1gOd48 13199833
49IDp76kjPw 11970018
tYnn51C3X_w 11823701
pv5zWaTEVkI 11672017
D2kJZOfq7zk 11184051
-- 查询sql(方式2)
select
t1.*
from (
select
videoid
,views
,row_number() over(order by views desc) as rank
from gulivideo_orc
) as t1
where t1.rank <= 10
;
-- 查询结果
dMH0bHeiRNg 42513417 1
0XxI-hvPRRA 20282464 2
1dmVU08zVpA 16087899 3
RB-wUgnyGv0 15712924 4
QjA5faZF1A8 15256922 5
-_CSo1gOd48 13199833 6
49IDp76kjPw 11970018 7
tYnn51C3X_w 11823701 8
pv5zWaTEVkI 11672017 9
D2kJZOfq7zk 11184051 10
问题2 : 统计视频类别热度 Top10
说明 : 查询各个类别的视频量,取前10
注意 : 视频分类(数组存储,一个视频可能会属于多个类别)
-- 查询sql(方式1)
select
t1.*
from (
select
category_single
,count(distinct videoid) as cnt
from gulivideo_orc
lateral view explode(category) t2 as category_single
group by category_single
) as t1
order by cnt desc
limit 10
;
-- 查询结果
category_single cnt
Music 179049
Entertainment 127674
Comedy 87818
Animation 73293
Film 73293
Sports 67329
Games 59817
Gadgets 59817
People 48890
Blogs 48890
Time taken: 31.571 seconds, Fetched: 10 row(s)
问题3 : 统计出视频观看数最高的 20 个视频的所属类别以及类别包含 Top20 视频的个数
说明 : Stage1 去播放量top20,top
-- 查询sql(方式1)
with t1 as (
-- top20 视频
select videoid
, category
, views
from gulivideo_orc
order by views desc
limit 20
)
select
category_single
,count(distinct videoid) as cnt
from t1
lateral view explode(category) tmp as category_single
group by category_single
;
category_single cnt
Blogs 2
Comedy 6
Entertainment 6
Music 5
People 2
UNA 1
Time taken: 27.308 seconds, Fetched: 6 row(s)
问题4 : 统计视频观看数 Top50 所关联视频的所属类别排序 ???

-- 查询sql(方式1)
with t1 as (
-- top20 视频
select videoid
, category
, views
from gulivideo_orc
order by views desc
limit 50
)
select
category_single
,count(distinct videoid) as cnt
from t1
lateral view explode(category) tmp as category_single
group by category_single
;
SELECT
t6.category_name,
t6.video_sum,
rank() over(ORDER BY t6.video_sum DESC ) rk
FROM ( SELECT
t5.category_name,
COUNT(t5.relatedid_id) video_sum
FROM
( SELECT
t4.relatedid_id,
category_name
FROM
( SELECT
t2.relatedid_id ,
t3.category
FROM
( SELECT
relatedid_id
FROM
( SELECT
videoId,
views,
relatedid
FROM
gulivideo_orc
ORDER BY
views
DESC
LIMIT 50
)t1
lateral VIEW explode(t1.relatedid) t1_tmp AS relatedid_id
)t2
JOIN
gulivideo_orc t3
ON
t2.relatedid_id = t3.videoId
) t4
lateral VIEW explode(t4.category) t4_tmp AS category_name
) t5
GROUP BY
t5.category_name
ORDER BY
video_sum
DESC ) t6
View Code
问题5 : 统计每个类别中的视频热度 Top10,以 Music 为例
-- 查询sql(方式1)
select
t1.*
from (
select
category_single
,videoid
,views
,row_number() over (partition by category_single order by views desc ) as rank
from gulivideo_orc
lateral view explode(category) t2 as category_single
) as t1
where rank <= 10
and category_single = 'Music'
;
-- 查询结果
category_single videoid views rank
Music QjA5faZF1A8 15256922 1
Music tYnn51C3X_w 11823701 2
Music pv5zWaTEVkI 11672017 3
Music 8bbTtPL1jRs 9579911 4
Music UMf40daefsI 7533070 5
Music -xEzGIuY7kw 6946033 6
Music d6C0bNDqf3Y 6935578 7
Music HSoVKUVOnfQ 6193057 8
Music 3URfWTEPmtE 5581171 9
Music thtmaZnxk_0 5142238 10
Time taken: 17.235 seconds, Fetched: 10 row(s)
问题6 : 统计每个类别视频观看数 Top10
-- 执行sql
select
t1.*
from (
select
category_single
,videoid
,views
,row_number() over (partition by category_single order by views desc ) as rank
from gulivideo_orc
lateral view explode(category) t2 as category_single
) as t1
where rank <= 10;
问题7 : 统计上传视频最多的用户Top10以及他们上传的视频 观看次数在前 20 的视频
-- 操作sql
select
t1.*
from (
select
uploader
, videoid
, views
from gulivideo_orc
where uploader in (
-- 上传视频数 top10 用户
select
uploader
from gulivideo_user_orc
order by videos desc
limit 10)
) as t1
order by views desc
limit 20
;