bitmap
--------------
位图/位映射。
5
hive
--------------
分区表
udtf函数
wordcount
lateral view //和udtf配合使用。
order by //数据倾斜
sort by //reduce内排序
distribute by //分区
cluster by //sort by + distribute by
hive
--------------
--显式表头
hive>set hive.cli.print.header=true ;
hive分析函数
--------------
1.准备数据
-- Source table for the window-function demos: one row per employee,
-- loaded from a comma-delimited text file (see the load step below).
CREATE TABLE emp
(
    id     INT,
    empno  STRING,
    age    INT,
    salary FLOAT,
    deptno INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
2.准备数据文件emp.txt(内容如下)
1,tom1,28,3000,1
2,tom2,29,5000,2
3,tom3,30,5400,1
4,tom4,28,6600,3
5,tom5,27,8000,2
6,tom6,35,10000,3
3.加载数据
load data local inpath '/home/centos/emp.txt' into table emp ;
hive分析函数应用
-----------------
0.简介
hive分析函数可以连接每条记录形成数据集,字段甚至可以不是分组字段,使用一次mr
完成聚合查询。
常规的sql下的分组聚合有诸多限制,其中select字段必须是分组字段,有时需要多次mr.
select deptno , max(salary) from emp group by deptno ;
1.分析函数
-- over , 只是分区
SELECT id, empno, salary ,deptno ,max(salary) OVER (PARTITION BY deptno) AS max from emp ;
-- OVER + ORDER BY , 分区并在分区内排序
SELECT empno, deptno, salary ,SUM(salary) OVER(PARTITION BY deptno ORDER BY salary) AS t1 from emp;
-- OVER ... rows unbounded preceding 基于前导所有行的汇总操作。
SELECT empno, deptno, salary , SUM(salary) OVER(ORDER BY deptno, empno rows unbounded preceding) AS t3 from emp ;
-- RANK, 排名操作 ,计算每个部门内按照工资的降序进行排名(有缝,并列情况存在缝隙)绝对排名。
SELECT empno, deptno, salary, RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
-- dense_rank()密度排名,无缝。绝对排名。
SELECT empno, deptno, salary, Dense_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
-- percent_rank()百分比排名,相对排名.
SELECT empno, deptno, salary, percent_RANK() OVER (PARTITION BY deptno ORDER BY salary desc) from emp ;
--NTILE(n) ,分桶操纵,将数据均匀分散到各个桶中。
SELECT empno, deptno, salary , NTILE(4) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
-- lead()前导操作,查询从当前开始,后续第几行的操作。
SELECT empno, deptno, salary, LEAD(salary, 2) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
--lag,从当前行计数,访问之前的第几行salary,如果超过窗口范围返回null。
SELECT empno, deptno, salary, lag(salary, 1) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
--first_value()
SELECT empno, deptno, salary, first_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
--last_value()
SELECT empno, deptno, salary, last_value(salary) OVER(PARTITION BY deptno ORDER BY salary desc) AS t1 from emp ;
--使用range开窗函数 RANGE BETWEEN ... AND ...,在分区内在划分记录范围。
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS t1 from emp ;
--RANGE : 对值的+/-.
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN UNBOUNDED PRECEDING AND current row) AS t1 from emp ;
--range : 计算的是值
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc RANGE BETWEEN 2000 PRECEDING AND 2000 FOLLOWING) AS t1 from emp ;
//rows计算的行
SELECT empno, deptno, salary, LAST_VALUE(salary) OVER (PARTITION BY deptno ORDER BY salary desc rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS t1 from emp ;
--窗口重用
SELECT empno, deptno, salary , MAX(salary) OVER w1 AS mx,MIN(salary) OVER w1 AS mn,AVG(salary) OVER w1 AS ag from emp WINDOW w1 AS (PARTITION BY deptno ORDER BY salary desc ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) ;
[开窗函数]
range|rows between ... and ... ;
range是值偏移,rows是行偏移。
2.统计各分组组合的最高工资(常规union写法,与下文grouping sets对比)
select max(salary) from emp group by deptno union select max(salary) from emp group by age union select max(salary) from emp group by deptno,age ;
hive高级聚合
---------------
1.grouping set
作用等同于union.
select deptno , age , count(1) from emp group by deptno,age grouping sets(deptno ,age ,(deptno,age) ) ;
3.rollup
select ... from ... GROUP BY a,b,c WITH ROLLUP ;
select ... from ... GROUP BY a,b,c GROUPING SETS ((a,b,c),(a,b),(a),())
select deptno , age , count(1) from emp group by deptno,age with rollup ;
4.cube
select ... from ... GROUP BY a,b,c WITH cube ;
select ... from ... GROUP BY a,b,c GROUPING SETS ((a),(a,b),(a,c),(a,b,c),(b),(b,c),(c),())
select deptno , age , count(1) from emp group by deptno,age with cube ;
hive优化
-------------------
1.创建索引
本质上就是表,对于检索单条记录是有优势的。
排序的。
--创建索引
CREATE INDEX idx_emp ON TABLE emp (empno) AS 'COMPACT' WITH DEFERRED REBUILD;
--生成索引
ALTER INDEX idx_emp ON emp REBUILD;
--查询是通过索引列查询
select * from emp where empno = 'tom1'
2.文件格式
[列存储格式]
parquet , orcfile , orc //
投影查询时,发挥磁盘的线性读写。
select id,name from emp ;
//创建表,存储成parquet格式
create table pq1(id int , empno string, age int ,salary float , deptno int ) stored as parquet ;
//插入数据,之后可在HDFS上查看生成的parquet文件
insert into pq1 select * from emp ;
[行存储]
txt
3.压缩
减少数据传输量,降低网络IO的负载。
--在多级job中,job之间的结果是否需要压缩。
SET hive.exec.compress.intermediate=true ;
SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec ;
SET hive.intermediate.compression.codec=org.apache.hadoop.io.compress.GzipCodec ;
SET hive.intermediate.compression.type=record|block|none ;
--控制job的最终输出是否压缩.
SET hive.exec.compress.output=true;
SET mapred.output.compression.codec= org.apache.hadoop.io.compress.GzipCodec;
4.大量小文件
导致map过多。
1.归档
hadoop Archive and HAR进行归档。
2.转换成序列文件
将小文件合成SequenceFile.
3.org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
alter table emp set inputformat=org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat
--修改默认输入格式
-- hive.input.format must name a Hive input format wrapper; the raw
-- MapReduce CombineTextInputFormat is not valid here. CombineHiveInputFormat
-- is Hive's built-in small-file-combining input format.
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat ;
--建新表
-- NOTE(review): the original used the new mapreduce.lib.input
-- CombineTextInputFormat (Hive requires old mapred-API input formats)
-- and paired it with the Avro container OUTPUTFORMAT, which does not
-- match a comma-delimited text layout. Corrected to the mapred-API
-- combine input format plus Hive's plain-text output format, and
-- terminated the statement with ';'.
CREATE TABLE emp2
(
    id     INT,
    empno  STRING,
    age    INT,
    salary FLOAT,
    deptno INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS
INPUTFORMAT 'org.apache.hadoop.mapred.lib.CombineTextInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';