Hive 练习题


准备数据
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
创建原始数据表:gulivideo_user_ori
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
创建 orc 存储格式带 snappy 压缩的表
gulivideo_orc
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
gulivideo_user_orc
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
向 ori 表插入数据
load data local inpath "/opt/module/data/video" into table gulivideo_ori;
load data local inpath "/opt/module/user" into table gulivideo_user_ori;
向 orc 表插入数据
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
业务分析
统计视频观看数 Top10
思路:使用 order by 按照 views 字段做一个全局排序即可,同时我们设置只显示前 10 条。
select
videoId,
views
from
gulivideo_orc
order by
views desc
limit 10;
统计视频类别热度 Top10
类别热度:指定的每个类别下的视频数
1.由于类别是数组,要把类别给炸出来
selelct
category_name
from
gulivideo_orc laternal view expolode(category) category_tmp as category;t1
-- 如果与原数据有关系就用上面的,没有关系就用下面的
select
explode(category) category_name
from
gulivideo_orc;t1
2.按照类别分组,求count 并按照count排序,取前十
select
categorty_name,
count(*) ct
from
t1
gruop by category_name
order by ct desc
limit 10;
3.最终SQL:
select
categorty_name,
count(*) ct
from
(select
explode(category) category_name
from
gulivideo_orc)t1
gruop by category_name
order by ct desc
limit 10;
统计出视频观看数最高的 20 个视频的所属类别以及类别包含Top20 视频的个数
- 求出视频观看数最高的20个视频所属类别
select
category,
viwes
from
gulivideo_orc
order by views desc
limit 20;t1
- 所属类别炸开
select
explode(category) category_ame
from
t1;t2
- 计算各个类别包含Top20 的个数
select
category_name
count(*) ct
from
t2
group by category_name
- 最终sql
select
category_name,
COUNT(*) ct
from
(SELECT
explode(category) category_name
from
(SELECT
category,
views
from gulivideo_orc
order by biews DESC
limit 20)t1)t2
group by category_name;
统计视频观看数 Top50 所关联视频的所属类别排序
- 求出视频观看数Top50的视频所关联视频的个数
SELECT
relatedId,
views
from gulivideo_orc
order by biews DESC
limit 50;t1
- 将关联视频炸开
SELECT
explode(relatedId) related_id
from
t1;t2
- join 原表,取出关联视频所属的类别(数组)
SELECT
g.category
from
t2
join gulivideo_orc g
on t2.related_id = g.videoId;t3
- 炸裂类别字段
SELECT
explode(category) category_name
from
t3;t4
- 按照类别分组,求count,并按照count排序
select
category_name,
COUNT(*)ct
from
t4
group by category_name
order by ct desc;
- 最终SQL
select
category_name,
COUNT(*)ct
from
( SELECT
explode(category) category_name
from
(select
g.category
FROM
(SELECT
explode(relatedId) related_id
from (SELECT
relatedId,
views
from gulivideo_orc
order by biews DESC
limit 50)t1)t2
join gulivideo_orc g
on t2.related_id = g.videoId)t3)t4
group by category_name
order by ct desc;
统计每个类别中的视频热度Top10,以Music为例子
- 重新建表(因为后面几个需要都有这个子查询表,所以不如直接建出表,直接使用)
create table gulivideo_orc_category(
videoId string,
uploader string,
age int,
category sring, -- 仅仅这个地方改变
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
insert into table gulivideo_orc_category
-- 查询不带类型
select
videoId,
uploader,
age,
category, -- 仅仅这个地方改变
length,
views,
rate,
ratings,
comments,
relatedId)
from gulivideo_orc lateral view explode(category) category_tmp as category_name;
- 最终SQL
select
videoId,
views
from gulivideo_orc_category
where category=''Music
order bvy views desc
limlt 10;
统计每个类别视频观看数 Top3
- 求出每个类别视频观看数排名
select
category,
videoId,
views,
rank()over(partition by category order by views desc) rk
from gulivideo_orc_category;t1
- 取出前三名
select
category,
videoId,
views
from
t1
where tk <=3;
- 最终SQL
select
category,
videoId,
views
from
(select
category,
videoId,
views,
rank()over(partition by category order by views desc) rk
from gulivideo_orc_category)t1
where tk <=3;
统计上传视频最多的用户 Top10以及他们上传的视频观看次数在前20的视频
- 求出上传视频最多的20个用户
SELECT
uploader
from gulivideo_user_orc
order by videos DESC
LIMIT 10;t1
- 跟视频表join,取出前十用户所上传的所有视频
select
t1.uploader,
videoId,
views
FROM
t1
join gulivideo_orc g
ON t1.uploader = g.uploader;t2
- 根据观看次数,对用户上传的视频进行排名
SELECT
uploader,
videoId,
views,
rank()over(PARTITION BY uploader order BY views desc)rk
FROM
t2;t3
- 取出每个上传者观看数前20的视频
SELECT
uploader,
videoId,
views
from
t3
where rk<=20;
- 最终sql
SELECT
uploader,
videoId,
views
rank()over(PARTITION BY uploader order BY views desc)rk
FROM
(
select
t1.uploader,
videoId,
views
from
(select
uploader
from gulivideo_user_orc
order by videos DESC
limit 10)t1
join gulivideo_orc g
ON t1.uploader = g.uploader)t2)t3
where rk<=20;

浙公网安备 33010602011771号