Hive 练习题

image
image
准备数据

create table gulivideo_ori(
 videoId string,
 uploader string,
 age int,
 category array<string>,
 length int,
 views int,
 rate float,
 ratings int,
 comments int,
 relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;

创建原始数据表:gulivideo_user_ori

create table gulivideo_user_ori(
 uploader string,
 videos int,
 friends int)
row format delimited 
fields terminated by "\t" 
stored as textfile;

创建 orc 存储格式带 snappy 压缩的表
gulivideo_orc

create table gulivideo_orc(
 videoId string, 
 uploader string, 
 age int, 
 category array<string>, 
 length int, 
 views int, 
 rate float, 
 ratings int, 
 comments int,
 relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");

gulivideo_user_orc

create table gulivideo_user_orc(
 uploader string,
 videos int,
 friends int)
row format delimited 
fields terminated by "\t" 
stored as orc
tblproperties("orc.compress"="SNAPPY");

向 ori 表插入数据

load data local inpath "/opt/module/data/video" into table gulivideo_ori;
load data local inpath "/opt/module/user" into table gulivideo_user_ori;

向 orc 表插入数据

insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;

业务分析

统计视频观看数 Top10

思路:使用 order by 按照 views 字段做一个全局排序即可,同时我们设置只显示前 10 条。

select
    videoId,
    views
from
    gulivideo_orc
order by 
    views desc
limit 10;

统计视频类别热度 Top10

类别热度:指定的每个类别下的视频数
1.由于类别是数组,要把类别给炸出来

selelct
    category_name
from
    gulivideo_orc laternal view expolode(category) category_tmp as category;t1

-- 如果与原数据有关系就用上面的,没有关系就用下面的

select
     explode(category) category_name
from
     gulivideo_orc;t1

2.按照类别分组,求count 并按照count排序,取前十

select 
    categorty_name,
	count(*) ct
from
   t1
gruop by category_name
order by ct desc
limit 10;

3.最终SQL:

select
    categorty_name,
	count(*) ct
from
   (select
       explode(category) category_name
    from
       gulivideo_orc)t1
gruop by category_name
order by ct desc
limit 10;

统计出视频观看数最高的 20 个视频的所属类别以及类别包含Top20 视频的个数

  1. 求出视频观看数最高的20个视频所属类别
select
     category,
	 viwes
from
    gulivideo_orc
order by views desc
limit 20;t1
  1. 所属类别炸开
select
    explode(category) category_ame
from
    t1;t2
  1. 计算各个类别包含Top20 的个数
select
    category_name
	count(*) ct
from
   t2
   group by category_name
  1. 最终sql
select
    category_name,
	COUNT(*) ct
from
    (SELECT
    explode(category) category_name
     from
	(SELECT
	   category,
		views
		from gulivideo_orc
		order by biews DESC
		limit 20)t1)t2
	group by category_name;

统计视频观看数 Top50 所关联视频的所属类别排序

  1. 求出视频观看数Top50的视频所关联视频的个数
SELECT 
   relatedId,
   views
from gulivideo_orc
order by biews DESC
limit 50;t1
  1. 将关联视频炸开
SELECT 
   explode(relatedId) related_id
from 
   t1;t2
  1. join 原表,取出关联视频所属的类别(数组)
SELECT 
     g.category
from 
     t2
join gulivideo_orc g
on t2.related_id = g.videoId;t3
  1. 炸裂类别字段
SELECT
   explode(category) category_name
from
    t3;t4
  1. 按照类别分组,求count,并按照count排序
select 
    category_name,
	COUNT(*)ct
from
    t4
group by category_name
order by ct desc;
  1. 最终SQL
select 
    category_name,
	COUNT(*)ct
from 
    ( SELECT
   explode(category) category_name
    from
	(select
	   g.category
	 FROM
	   (SELECT
		   explode(relatedId) related_id
		from (SELECT
		   relatedId,
		   views
		from gulivideo_orc
		order by biews DESC
		limit 50)t1)t2
	join gulivideo_orc g
	on t2.related_id = g.videoId)t3)t4
group by category_name
order by ct desc;

统计每个类别中的视频热度Top10,以Music为例子

  1. 重新建表(因为后面几个需要都有这个子查询表,所以不如直接建出表,直接使用)
create table gulivideo_orc_category(
 videoId string,
 uploader string,
 age int, 
 category sring,   -- 仅仅这个地方改变
 length int, 
 views int, 
 rate float, 
 ratings int, 
 comments int,
 relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
insert into table gulivideo_orc_category
-- 查询不带类型
select
 videoId,
 uploader,
 age,
 category,   -- 仅仅这个地方改变
 length,
 views,
 rate,
 ratings,
 comments,
 relatedId)
from gulivideo_orc lateral view explode(category) category_tmp as category_name;
  1. 最终SQL
select
     videoId,
	 views
from gulivideo_orc_category
where category=''Music
order bvy views desc
limlt 10;

统计每个类别视频观看数 Top3

  1. 求出每个类别视频观看数排名
select 
     category,
	 videoId,
	 views,
	 rank()over(partition by category order by views desc) rk
from gulivideo_orc_category;t1
  1. 取出前三名
select
    category,
	videoId,
	views
from
    t1
where tk <=3;
  1. 最终SQL
select
    category,
	videoId,
	views
from
    (select
     category,
	 videoId,
	 views,
	 rank()over(partition by category order by views desc) rk
      from gulivideo_orc_category)t1
where tk <=3;

统计上传视频最多的用户 Top10以及他们上传的视频观看次数在前20的视频

  1. 求出上传视频最多的20个用户
 SELECT
   uploader
 from gulivideo_user_orc
 order by videos DESC
 LIMIT 10;t1
  1. 跟视频表join,取出前十用户所上传的所有视频
 select
     t1.uploader,
	 videoId,
	 views
 FROM
    t1
join gulivideo_orc g
ON t1.uploader = g.uploader;t2
  1. 根据观看次数,对用户上传的视频进行排名
SELECT
     uploader,
	 videoId,
	 views,
	 rank()over(PARTITION BY uploader order BY views desc)rk
FROM
    t2;t3
  1. 取出每个上传者观看数前20的视频
SELECT
    uploader,
	videoId,
	views
from 
    t3
where rk<=20;
  1. 最终sql
SELECT
    uploader,
	videoId,
	views
	rank()over(PARTITION BY uploader order BY views desc)rk
FROM 
    (
	 select
		 t1.uploader,
		 videoId,
		 views
         from
	        (select
		      uploader
		 from gulivideo_user_orc
		 order by videos DESC
		 limit 10)t1
     join gulivideo_orc g
     ON t1.uploader = g.uploader)t2)t3
where rk<=20;
posted @ 2022-11-13 11:54  去揽一池星河  阅读(106)  评论(0)    收藏  举报