select a.id, a.month from user_b a union all select b.id, b.day from user_b b;
Bucketed tables: similar to hash partitioning
=============================================
bucket // clustered by (id) into 2 buckets
Bucketing avoids the flood of directories that an over-partitioned table would create
Each bucket exists as a file segment inside the table (or partition) directory
Inserting into a table that is both partitioned and bucketed (partition values are placeholders):
insert into xx partition(<col>='<value>', <col>='<value>') select 1,'tom',20;
000000 // first bucket file
000001 // second bucket file
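A minimal end-to-end sketch of bucketing (table and source names are made up for illustration):
-- older Hive versions need this so inserts actually hash rows into buckets
set hive.enforce.bucketing=true;
create table user_bucket(id int, name string, age int)
clustered by (id) into 2 buckets
row format delimited fields terminated by '\t';
-- rows are hashed on id into the two bucket file segments (the 000000/000001 files above)
insert into user_bucket select id, name, age from user_src;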
select size(arr) // number of elements in a collection
select array(1,2,3) // build an array
select map(1,'tom',2,'tomas') // build a map
select struct(1,'tom','male') // build an anonymous struct ['col1':1,'col2':'tom' ...]
select named_struct('id',1,'name','tom','sex','male') // build a named struct
select current_date(); // current date
select current_timestamp(); // current time (millisecond precision)
select date_format(current_date(),'yyyyMMdd') // format a date
select date_format(current_timestamp(),'yyyyMMdd') // format a timestamp
select from_unixtime(bigint,'yyyyMMdd') // convert a unix timestamp (bigint seconds) to a date string
Hive file formats: stored as parquet;
===========================================
Load path: create a text-format table first, then insert into xx select * from it (sketch below)
Row-oriented formats
text
seqFile // 1M blocks
Column-oriented formats // projection queries can skip the columns they do not need
rcFile // 4M blocks
orcFile // supports larger blocks, up to 256M
parquet // supported by more components of the Hadoop ecosystem
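A sketch of that load path with illustrative table names: stage the data in a text table, then rewrite it into a columnar format with an insert-select.
create table logs_text(id int, msg string)
row format delimited fields terminated by '\t'
stored as textfile;
create table logs_parquet(id int, msg string)
stored as parquet;
-- the insert rewrites the text rows as parquet files
insert into logs_parquet select * from logs_text;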
SerDe:
=========================================
serialize && deserialize
Maps the fields of a file onto the columns of a Hive table
Applied after the InputFormat stage
textFile: LazySimpleSerDe
OpenCSVSerde
JsonSerDe // the table's column names must match the keys in the JSON
Specifying a SerDe when creating a table:
row format serde 'org.xxx.xxx.JsonSerde';
Column names that start with special characters must be backquoted:
create table xx(`_location` string);
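A sketch of a JSON table, assuming the HCatalog JsonSerDe that ships with Hive (org.apache.hive.hcatalog.data.JsonSerDe; use whichever SerDe jar is actually on the classpath), with a backquoted column as above:
create table merchant_json(
  `_location` string,
  id int,
  name string
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile;
-- remember: the column names must match the JSON keys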
Analytic (window) functions:
=================================
sum()over(sort by age rows between unbounded preceding and unbounded following); // ordering plus an explicit ROWS frame
sum()over(sort by age range between unbounded preceding and unbounded following); // same, but a RANGE frame
sum()over(partition by province order by age range between unbounded preceding and unbounded following); // partition + ordering + frame
current row // the current row
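A concrete version of those frame clauses, using the user1 table (id, name, age, province, city) that appears later in these notes:
select id, province, age,
  sum(age) over (partition by province order by age
                 rows between unbounded preceding and current row) as running_sum,
  sum(age) over (partition by province) as province_total
from user1;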
Hive ranking functions
row_number() gives a tie-free sequence (even equal rows get different numbers); it can be used for pagination
dense_rank() ranks without gaps: two rows tied at 2nd are followed by 3rd
rank() ranks with gaps: two rows tied at 2nd are followed by 4th
-- percent_rank(): relative rank expressed as a percentage
SELECT empno, deptno, salary, percent_rank() OVER (PARTITION BY deptno ORDER BY salary desc) from emp;
rank() // ties, then skip: 1 1 3
dense_rank() // no skip: 1 1 2
row_number() // strict sequence: 1 2 3
ntile(n) // split the rows into n roughly equal groups
first_value() // first value in the window
lead() // look ahead to a following row
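The functions above side by side, on the emp table from the percent_rank example:
select deptno, empno, salary,
  rank()        over (partition by deptno order by salary desc) as rk,
  dense_rank()  over (partition by deptno order by salary desc) as drk,
  row_number()  over (partition by deptno order by salary desc) as rn,
  ntile(3)      over (partition by deptno order by salary desc) as grp,
  first_value(salary) over (partition by deptno order by salary desc) as top_salary,
  lead(salary)  over (partition by deptno order by salary desc) as next_salary
from emp;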
// users active in two consecutive months
select id , m , n from
(select id, (month+1) as m , lead(month)over(partition by id order by month) as n from user_c) a
where a.m=a.n;
// users active in three consecutive months
select distinct id from (select id, (month+2) as first , (lead(month)over(partition by id order by month)+1)
as second, lead(month,2)over(partition by id order by month) as third from user_c) a
where a.first=a.second and a.second=a.third;
lag() // look back to a preceding row
pv: page view // total number of page views
uv: unique visitors // number of distinct users
Computing uv per month and per day from the pv data:
select month,day, count(distinct id) as uv from user_b group by month,day;
Advanced aggregation functions: **********************
grouping__id // identifies which grouping produced the row
grouping sets // compute monthly-active and daily-active counts in a single query
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
grouping sets(month,day) order by grouping__id ;
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| 2015-02 | NULL | 2 | 1 |
| NULL | 2015-04-16 | 2 | 2 |
| NULL | 2015-04-15 | 2 | 2 |
| NULL | 2015-04-13 | 3 | 2 |
| NULL | 2015-04-12 | 2 | 2 |
| NULL | 2015-03-12 | 1 | 2 |
| NULL | 2015-03-10 | 4 | 2 |
| NULL | 2015-02-16 | 2 | 2 |
+----------+-------------+-----+---------------+--+
rollup // hierarchical subtotals plus a grand total
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
with rollup order by grouping__id ;
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| 2015-02 | NULL | 2 | 1 |
| 2015-04 | 2015-04-16 | 2 | 3 |
| 2015-04 | 2015-04-15 | 2 | 3 |
| 2015-04 | 2015-04-13 | 3 | 3 |
| 2015-04 | 2015-04-12 | 2 | 3 |
| 2015-03 | 2015-03-12 | 1 | 3 |
| 2015-03 | 2015-03-10 | 4 | 3 |
| 2015-02 | 2015-02-16 | 2 | 3 |
+----------+-------------+-----+---------------+--+
select day, month, count(distinct id) as uv, grouping__id from user_b group by day,month
with rollup order by grouping__id ;
+-------------+----------+-----+---------------+--+
| day | month | uv | grouping__id |
+-------------+----------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-04-16 | NULL | 2 | 2 |
| 2015-04-15 | NULL | 2 | 2 |
| 2015-04-13 | NULL | 3 | 2 |
| 2015-04-12 | NULL | 2 | 2 |
| 2015-03-12 | NULL | 1 | 2 |
| 2015-03-10 | NULL | 4 | 2 |
| 2015-02-16 | NULL | 2 | 2 |
| 2015-04-16 | 2015-04 | 2 | 3 |
| 2015-04-13 | 2015-04 | 3 | 3 |
| 2015-04-12 | 2015-04 | 2 | 3 |
| 2015-03-12 | 2015-03 | 1 | 3 |
| 2015-03-10 | 2015-03 | 4 | 3 |
| 2015-02-16 | 2015-02 | 2 | 3 |
| 2015-04-15 | 2015-04 | 2 | 3 |
+-------------+----------+-----+---------------+--+
cube // every combination of the grouping columns
select month,day, count(distinct id) as uv, grouping__id from user_b group by month,day
with cube order by grouping__id ;
With n grouping columns, cube produces every subset of them (2^n groupings, including the empty one):
2 columns: (), (1), (2), (1,2)
3 columns: (), (1), (2), (3), (1,2), (1,3), (2,3), (1,2,3)
4 columns: 16 groupings in total
+----------+-------------+-----+---------------+--+
| month | day | uv | grouping__id |
+----------+-------------+-----+---------------+--+
| NULL | NULL | 7 | 0 |
| 2015-02 | NULL | 2 | 1 |
| 2015-04 | NULL | 6 | 1 |
| 2015-03 | NULL | 5 | 1 |
| NULL | 2015-02-16 | 2 | 2 |
| NULL | 2015-04-16 | 2 | 2 |
| NULL | 2015-04-15 | 2 | 2 |
| NULL | 2015-04-13 | 3 | 2 |
| NULL | 2015-04-12 | 2 | 2 |
| NULL | 2015-03-12 | 1 | 2 |
| NULL | 2015-03-10 | 4 | 2 |
| 2015-04 | 2015-04-12 | 2 | 3 |
| 2015-03 | 2015-03-12 | 1 | 3 |
| 2015-03 | 2015-03-10 | 4 | 3 |
| 2015-04 | 2015-04-16 | 2 | 3 |
| 2015-02 | 2015-02-16 | 2 | 3 |
| 2015-04 | 2015-04-15 | 2 | 3 |
| 2015-04 | 2015-04-13 | 3 | 3 |
+----------+-------------+-----+---------------+--+
cookie1 1
cookie2 5
cookie2 4
cookie1 3
cookie2 7
cookie1 4
cookie2 2
cookie3 2
cookie2 3
cookie3 5
cookie1 6
cookie3 10
cookie2 8
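The rows above look like (cookie id, pv) practice data; assuming they live in a table cookie_pv(cookieid string, pv int), a typical window query over them would be:
select cookieid, pv,
  sum(pv)  over (partition by cookieid order by pv
                 rows between unbounded preceding and current row) as running_pv,
  lag(pv)  over (partition by cookieid order by pv) as prev_pv,
  lead(pv) over (partition by cookieid order by pv) as next_pv
from cookie_pv;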
Hive transactions (ACID):
===================================
1. Configuration
SET hive.support.concurrency=true;
SET hive.enforce.bucketing=true;
SET hive.exec.dynamic.partition.mode=nonstrict;
SET hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
SET hive.compactor.initiator.on=true;
SET hive.compactor.worker.threads=1;
2. The table must be bucketed
3. The table must be declared transactional
4. The table must be stored as ORC
create table user3(id int, name string, age int, province string, city string)
clustered by(province) into 2 buckets
row format delimited
fields terminated by '\t'
stored as orc
tblproperties('transactional'='true');
5、insert into user3 select * from user1;
6、update user3 set age=200 where id=5;
Sampling: take a subset of a large dataset
============================================
Random sampling:
select * from user2 distribute by rand() sort by rand() limit 10;
random distribution across reducers + random sort within each reducer
Bucket sampling:
an optimized form of sampling for bucketed tables
select name from user1 TABLESAMPLE(bucket 1 <bucket to pick> out of 4 <total buckets> on rand());
randomly picks the data of one bucket
create table user1_bucket(id int, name string, age int, province string, city string)
clustered by(province) into 4 buckets
row format delimited fields terminated by '\t';
insert into user1_bucket select * from user1;
1 tom 30 anhui anhui
2 tomas 30 hefei
1 tom 30 heilongjiang
2 tomas 30 jilin
1 tom 30 liaoning
2 tomas 30 neimenggu
1 tom 30 shandong
2 tomas 30 shanxi
1 tom 30 qinghai
2 tomas 30 jiangsu
1 tom 30 gansu
2 tomas 30 ningxia
1 tom 30 hubei
2 tomas 30 sichuan
1 tom 30 chongqing
2 tomas 30 taiwan
1 tom 30 xianggang
2 tomas 30 guangdong
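With user1_bucket in place, sampling on the clustering column lets Hive read only the matching bucket file instead of the whole table (a sketch):
-- bucket 1 of 4 on province, the clustering column, so only that file segment is scanned
select * from user1_bucket tablesample(bucket 1 out of 4 on province);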
Block sampling: sample by percentage, by data size, or by row count
select * from users TABLESAMPLE(5 percent);
select * from users TABLESAMPLE(30M);
select * from users TABLESAMPLE(2 rows);
User-defined functions
=====================
1. UDF // one row in, one row out: add(id,name) ===> idname
// processes one row at a time and returns one row
entry point: extend the UDF class
2. UDTF // one row in, many rows out: explode(arr) ===> arr[1]
arr[2]
arr[3]
3. UDAF // many rows in, one row out: sum(age) ===> 500
Writing a custom Hive function:
==========================================
1. pom dependency
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>2.1.1</version>
</dependency>
2. Write the code
Note: Hive UDFs do not support varargs, but a List parameter works
package com.oldboy.hive;

import java.util.List;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

@Description(
    name = "add",
    value = "this is an add function",
    extended = "select add(1,2,3) ====> 6 ; select add('hello','world') ====> helloworld"
)
public class UDFAdd extends UDF {
    /**
     * Add two integers.
     */
    public Integer evaluate(int i, int j) {
        return i + j;
    }

    /** Concatenate two strings. */
    public String evaluate(String i, String j) {
        return i + j;
    }

    /** Sum every element of a list, e.g. add(array(1,2,3)). */
    public int evaluate(List<Integer> i) {
        int j = 0;
        for (Integer integer : i) {
            j += integer;
        }
        return j;
    }
}
3. Build and load the jar
Option 1: package the jar and copy it into Hive's lib directory // Hive's default classpath
Option 2: add jar /soft/hive/lib/myhive-1.0-SNAPSHOT.jar; // add it to the classpath by hand
Option 3: edit hive-site.xml // hive.aux.jars.path=/x/x/x.jar
4. Register the class as a Hive function
Create the function [temporary function]
beeline> create temporary function add as 'com.oldboy.hive.UDFAdd';
[permanent function]
beeline> create function add as 'com.oldboy.hive.UDFAdd'
using jar 'hdfs:///path/to/x.jar';
This approach matters: on a fully distributed cluster the jar must be registered this way, otherwise the worker nodes cannot find the function class.
5. Dropping the function:
drop temporary function add ;
1. Parse each merchant's tags with a UDF and return them
json ==== udf + fastjson ===> {'good taste','good service'}
How to use fastjson inside Hive? // copy its jar into Hive's lib directory and restart Hive
1) Write and package the program
2) Copy the UDF jar and the fastjson jar into Hive's lib directory
3) Register a temporary function: create temporary function parseJson as 'com.oldboy.hive.TempTagUDF';
4) Create the temptags table with id and json columns, delimited by '\t'
5) Load TempTag.txt into the temptags table
6) Run the UDF over the merchant reviews
7) Count each merchant's distinct tags
select id, count(distinct tag) from temptags lateral view explode(parseJson(json)) xx as tag group by id;
8) Count how many times each merchant received each tag
select id, tag, count(tag) from temptags lateral view explode(parseJson(json)) xx as tag group by id,tag ;
!!!! Problem: ClassNotFoundException
Caused by: java.lang.ClassNotFoundException: com.oldboy.hive.TempTagUDF
Cause: when Hive runs the query as an MR job, the other nodes never receive the jar that contains TempTagUDF
Fix: copy the UDF jar and the fastjson jar into ${HADOOP_HOME}/share/hadoop/common/lib and sync them to every node
No need to restart Hadoop
Set the Maven compiler to JDK 1.8 (the default is 1.5) // goes under the <project> tag
<build>
  <plugins>
    <plugin>
      <artifactId>maven-compiler-plugin</artifactId>
      <configuration>
        <source>1.8</source>
        <target>1.8</target>
      </configuration>
    </plugin>
  </plugins>
</build>
Virtual columns (lateral view):
======================
> select name, explode(work_place) from default.employee; // a UDTF cannot be used alongside other SELECT expressions
UDTF's are not supported outside the SELECT clause, nor nested in expressions (state=42000,code=10081)
lateral view // complements table-generating functions
select id, tag from temptags lateral view explode(parseJson(json)) xx as tag ;
// virtual column
// lateral view explode(parseJson(json)) xx as tag (xx is just a placeholder alias with no meaning)
select id, explode(...) // not allowed without lateral view
UDTF:
===================================
StructObjectInspector class
ObjectInspector[]
create temporary function wc as 'com.oldboy.hive.TestUDTF';
The lazy format in LazySimpleSerDe: fields default to string and are converted only when they are used or a type is declared
Hive optimization:
=====================================
Performance tools:
EXPLAIN // shows the overall flow of the MR jobs Hive will run
//explain select count(1) from wc;
ANALYZE // gathers statistics so the next run can use the CBO (cost-based optimizer)
//analyze table wc compute STATISTICS
//desc formatted wc =====> shows row counts and file sizes
Design optimization:
Partitioned tables: // partition by date+time, location, or a business key; speeds up WHERE-clause pruning
//create table xx() partitioned by (province string, city string);
Bucketed tables: // enable sampling and speed up joins (if the bucketing column matches the join column,
//the join reads only the matching bucket file segments instead of scanning whole files)
//create table xx() clustered by(province) into 2 buckets;
Indexes: // CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)
AS 'COMPACT' WITH DEFERRED REBUILD; // compact index
// CREATE INDEX idx_id_employee_id ON TABLE employee_id (id)
AS 'BITMAP' WITH DEFERRED REBUILD; // bitmap index
Execution engine: // Hive 2 no longer recommends MR as the execution engine
//Spark or Tez is recommended instead
Data file format optimization:
Formats:
text
seqFile
RCFile
ORCFile
Parquet
Compression codecs (DefaultCodec/gzip/lzo/bzip2/lz4/snappy)
Intermediate data compression: // relieves network and disk pressure between stages
//set hive.exec.compress.intermediate=true
//set hive.intermediate.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
Output compression // reduces the disk footprint of the final output
//set hive.exec.compress.output=true
//set mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec
Storage optimization:
1. Use HAR (Hadoop archive) files
2. Use the SequenceFile format
3. Use CombineFileInputFormat // Hive's automatic optimization: many small files are combined into one split
4. Use HDFS federation // spread namenode metadata across extra nodes to scale capacity horizontally (rarely needed here)
5. Settings:
set hive.merge.mapfiles=true // merge small output files of map-only jobs
set hive.merge.mapredfiles=true // merge small output files of MR jobs
set hive.merge.size.per.task=256000000 // target size of the merged files
set hive.merge.smallfiles.avgsize=16000000 // average-file-size threshold that triggers the merge
Job and query optimization:
Local mode:
beeline local mode (no need to start hiveserver2): beeline -u jdbc:hive2:// -n centos
Conditions for Hive to run a query in local mode automatically:
SET hive.exec.mode.local.auto=true; // allow automatic local mode
SET hive.exec.mode.local.auto.inputbytes.max=50000000; // input-size threshold; above it the query leaves local mode
SET hive.exec.mode.local.auto.input.files.max=5; // input-file-count threshold; above it the query leaves local mode
JVM reuse: only applies to MRv1, not to YARN
tasks of an MR job share JVM instances instead of each starting its own
set mapred.job.reuse.jvm.num.tasks=5
Parallel computation: usually means distributed jobs, the same job processed on different hosts at the same time
Concurrent execution: usually refers to threads
The stages of a Hive job do not always depend on one another; independent stages can run in parallel:
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=16;
Join optimization: **********
Common join // reduce-side join
//hint the large table with /*+ STREAMTABLE(bigtable) */
Map join // map-side join
//hint the small table with /*+ MAPJOIN(smalltable) */ (example below)
SET hive.auto.convert.join=true;
--default false
SET hive.mapjoin.smalltable.filesize=600000000;
--default 25M; above this size the join falls back to a reduce-side join
SET hive.auto.convert.join.noconditionaltask=true;
--default false; true means no hint is needed
SET hive.auto.convert.join.noconditionaltask.size=10000000;
--matches the table size against available memory
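A hinted map-side join might look like this (dept is an illustrative dimension table; with hive.auto.convert.join=true the hint is usually unnecessary):
select /*+ MAPJOIN(d) */ e.empno, e.salary, d.dname
from emp e join dept d on e.deptno = d.deptno;
-- the STREAMTABLE hint works the same way, naming the large table instead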
Bucket map join: //SET hive.auto.convert.join=true; --default false
//SET hive.optimize.bucketmapjoin=true; --default false
!!!!!!!! Handling data skew in joins
//SET hive.optimize.skewjoin=true; enables skew-join load balancing
SET hive.skewjoin.key=100000; // if a reducer receives more rows than this for a key, the extra rows are automatically sent to idle reducers
group by optimization
!!!!!!!! Handling data skew in group by
SET hive.groupby.skewindata=true;
1. Rewrite the following statement in a Hive-friendly way:
SELECT a.key,a.value
FROM a
WHERE a.key not in (SELECT b.key FROM b)
答案:
select a.key, a.value from a left outer join b on a.key = b.key where b.key is null;
(equivalently: select a.key, a.value from a where not exists (select 1 from b where b.key = a.key);)
2. Multi-group-by is a very useful Hive feature; give an example. // multiple groupings from one table scan
from A
insert overwrite table B
select A.a, count(distinct A.b) group by A.a
insert overwrite table C
select A.c, count(distinct A.b) group by A.c
3. Write the statement that loads text.txt into the '2016-10-10' partition of Hive table test, whose partition column is l_date.
LOAD DATA LOCAL INPATH '/your/path/test.txt' OVERWRITE INTO TABLE test PARTITION (l_date='2016-10-10')
https://blog.csdn.net/haohaixingyun/article/details/52819588 (reference link)
https://blog.csdn.net/ukakasu/article/details/47860647 (interview questions with worked examples on large tables; includes links to the original questions)