Hive command reference: Hive optimization, data skew, partitioning and bucketing
SELECT T.sname,T.ctfid,T.gender,t.address,
count(*) OVER(PARTITION BY T.sname) AS FM_CNT
FROM test_db.room3 T WHERE T.address like "%北京%" AND instr(T.ctfid,'310')>0
ORDER BY FM_CNT DESC;
select gender,sum(cnt) as cnt from test_db.room2_view where length(trim(gender))>0 group by gender order by cnt desc;
desc test_db.room2;
-- DROP TABLE default.room2_gender_out PURGE ;
-- Drop a database
-- DROP DATABASE db_hive1;
-- 1. Create a database
CREATE DATABASE IF NOT EXISTS db_hive1
COMMENT "hive学习教程数据库" --数据库备注
LOCATION '/hivedata/db_hive1' --数据库存储hdfs路径
WITH DBPROPERTIES ("create_dt"="2025-12-07","create_user"="hadoop") --数据库信息k,v
;
-- List databases
SHOW DATABASES LIKE "*db*";
-- Show database info
DESCRIBE DATABASE db_hive1;
-- Show more details
DESC DATABASE EXTENDED db_hive1;
-- 2. Alter a database
ALTER DATABASE db_hive1 set DBPROPERTIES ("create_dt"="2025-12-08","create_user"="wwwdp") ;
ALTER DATABASE db_hive1 set LOCATION '/hivedata/db_hive1'; -- change the database's HDFS storage path
ALTER DATABASE db_hive1 set OWNER USER hadoop; -- change the database owner
-- Drop a database
-- DROP DATABASE db_hive1 RESTRICT ; -- fails if the database is not empty
-- DROP DATABASE db_hive1 CASCADE ; -- drops all tables along with the database
-- Switch the current database
use test_db;
-- 3. Creating tables
-- Managed (internal) table
drop table db_hive1.tb_test1 purge ;
CREATE TABLE db_hive1.tb_test1
(dat_dt date COMMENT "DATE",
idky bigint COMMENT "primary key",
amt decimal(30,4) COMMENT "AMT",
name varchar(4000) COMMENT "NAME",
work_list array<string> COMMENT "ARRAY",
addr_map MAP<string,string> COMMENT "Map"
)
COMMENT "nei bu biao"
--location "/hivedata/db_hive1/tb_test1"
-- row format delimited
-- ROW FORMAT SERDE ''
STORED AS orc -- textfile/orc/parquet
TBLPROPERTIES ("create_dt"="20251207")
;
INSERT INTO db_hive1.tb_test1 VALUES
(date '2023-10-01', 123456789, 1234.5678, 'John Doe',
array('work1', 'work2'), map('city', 'New York', 'country', 'USA'));
select * from db_hive1.tb_test1 t;
select t.dat_dt,t.idky,t.name,
t.work_list[0],
t.work_list[1],
t.addr_map["city"],
t.addr_map["country"]
from db_hive1.tb_test1 t;
-- Show table details
desc extended db_hive1.tb_test1;
-- Session-scoped temporary table
CREATE TEMPORARY TABLE db_hive1.tb_test2
(dat_dt date,
idky int,
name string)
COMMENT "nei bu biao"
STORED AS ORC -- textfile/orc/parquet
;
-- select * from db_hive1.tb_test2;
-- External table
CREATE EXTERNAL TABLE db_hive1.tb_test3
(dat_dt date,
idky int,
name string)
COMMENT "nei bu biao"
location "/hivedata/db_hive1/tb_test3"
STORED AS orc -- textfile/orc/parquet
TBLPROPERTIES ("tab"="tb_test3")
;
desc extended db_hive1.tb_test3;
select date '2024-10-01' dat;
-- Data types
select cast('908.342' as decimal(30,4)) as vles;
-- CREATE TABLE ... LIKE syntax
CREATE TABLE db_hive1.room4 like db_hive1.tb_test1;
-- CREATE TABLE ... AS SELECT (CTAS) syntax
drop table db_hive1.room4 purge ;
create table db_hive1.room4
stored as parquet
as
select * from test_db.room3;
use db_hive1;
select * from db_hive1.room4 r where r.address like "%剑河%";
-- Show the table's DDL
show create table db_hive1.tb_test1;
SELECT T.sname,T.ctfid,T.gender,t.address,
count(*) OVER(PARTITION BY T.sname) AS FM_CNT
FROM db_hive1.room4 T WHERE T.address like "%北京%" AND instr(T.ctfid,'310')>0
ORDER BY FM_CNT DESC;
-- Create a table with a custom field delimiter
CREATE TABLE db_hive1.tb_test5
(dat_dt date,
idky int,
name string)
COMMENT "delimited text table"
ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|" -- the default SerDe only honors the first character of a multi-char delimiter
STORED AS textfile -- must be textfile for the raw-file put below; orc/parquet would ignore the delimiter
location "/hivedata/db_hive1/tb_test5"
TBLPROPERTIES ("tab"="tb_test5")
;
-- Data can be loaded into tb_test5 with hdfs put; each line of input.txt must separate fields with |@|
-- hdfs dfs -put input.txt /hivedata/db_hive1/tb_test5
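-- If the full |@| delimiter must be honored, one option is MultiDelimitSerDe. A minimal sketch,
-- assuming Hive 0.14+ (tb_test5_multi is an illustrative name; before Hive 4 the class may live at
-- org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe instead):
CREATE TABLE db_hive1.tb_test5_multi
(dat_dt date,
idky int,
name string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
WITH SERDEPROPERTIES ("field.delim"="|@|") -- the whole multi-character delimiter is applied
STORED AS textfile;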
-- A JSON-backed Hive table can be populated the same way: put files straight into its location
drop table db_hive1.teacher;
create table db_hive1.teacher(
name string comment "name",
friend array<varchar(256)> comment "friend",
students map<string,int> comment "students",
adderss struct<city:string,amt:decimal(30,4),room:array<int>> comment "adderss"
)
--row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe' -- specify the JSON SerDe
--location "/hivedata/db_hive1/teacher"
;
INSERT INTO db_hive1.teacher
select "hadoop1" as name,
array("fdaf","fdafdfff") as friend,
map("city",64) as students,
struct("guiyangshi",123.543,array(9,0,32)) as adderss;
INSERT INTO db_hive1.teacher
values ("hadoop1",
array("fdaf", "fdaffdas"),
map("city", 64),
struct("guiyangshi", 123.543, `array`(1,2,3,4)));
select * from teacher;
-- Show table info
desc extended teacher;
desc formatted teacher;
show tables in db_hive1 "teacher" ;
-- ALTER TABLE syntax
drop table stu;
create table db_hive1.stu
(id int,
name string)
row format delimited fields terminated by ",";
ALTER TABLE stu RENAME TO stu1;
-- Add a column
ALTER TABLE stu1 add columns(gender string);
-- Modify columns
set hive.metastore.disallow.incompatible.col.type.changes=false;
alter table stu1 change column gender gend string after name;
alter table db_hive1.stu1 replace columns(id int,name string,gend string);
-- TRUNCATE only works on managed tables; it is not allowed on external tables
ALTER TABLE stu1 RENAME TO stu; -- rename back so the examples below keep using stu
truncate table db_hive1.stu;
desc stu;
-- II. DML: loading, inserting, exporting, and querying data
-- Local load: OVERWRITE replaces existing data; without it rows are appended. LOCAL INPATH resolves on the node where the Hive service runs
LOAD DATA LOCAL INPATH "/home/hadoop/stu.txt" INTO TABLE db_hive1.stu;
-- Load from an HDFS path (the source files are moved into the table location)
LOAD DATA INPATH "" OVERWRITE INTO TABLE db_hive1.stu;
-- Partitioned-table variant
-- LOAD DATA INPATH "" OVERWRITE INTO TABLE db_hive1.stu PARTITION (....);
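-- For example, assuming a table partitioned by dt (the dt column and path are illustrative):
-- LOAD DATA INPATH "/data/stu" OVERWRITE INTO TABLE db_hive1.stu PARTITION (dt='2025-12-07');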
select * from db_hive1.stu;
-- Insert multiple rows
insert into db_hive1.stu
values(1,"faf"),
(2,"test");
-- Write query results to a local directory
insert overwrite local directory "/home/hadoop/data/stu"
row format serde "org.apache.hadoop.hive.serde2.JsonSerDe"
select id,name from db_hive1.stu;
set mapreduce.job.reduces=1; -- force a single reducer so the export produces one file
insert overwrite local directory "/home/hadoop/data/stu"
row format delimited --serde "org.apache.hadoop.hive.serde2.JsonSerDe"
select id,name from db_hive1.stu;
-- EXPORT and IMPORT
EXPORT TABLE db_hive1.stu to "/hivedata/stuout"; -- exports data plus metadata to HDFS
IMPORT /*external*/ TABLE stu from "/hivedata/stuout"; -- imports metadata and data; EXTERNAL creates an external table
-- III. Hive queries
//SELECT * FROM stu WHERE ID = 1 GROUP BY id HAVING count(*)>1 ORDER BY id LIMIT 1,10;
SELECT * FROM test_db.room2_view limit 4;
SELECT * FROM test_db.room2_view w where not w.gender ='12';
SELECT * FROM db_hive1.stu T WHERE T.id BETWEEN 0 AND 2;
SELECT * FROM db_hive1.stu T WHERE nvl(T.id,1) IN(0,1);
SELECT * FROM db_hive1.stu T where T.name like "%f%";
SELECT * FROM db_hive1.stu T where T.name regexp "<regular expression>";
-- Logical operators: and, or, not
-- Aggregate functions: count(*), avg(), max(), min()
-- Local mode
-- set mapreduce.framework.name=local;
select * from stu;
SELECT
row_number() over (order by t.ctfid) as rn,
T.* FROM db_hive1.room4 T LIMIT 1000;
SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t where t.ctftp is not null group by t.ctftp order by count(*) desc;
-- ORDER BY: follow it with LIMIT where possible; the limit is applied map-side and keeps the single reducer from running out of memory
SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100;
set mapreduce.job.reduces;
SELECT count(1) FROM test_db.room3;
WITH TEMP AS (
SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100
)
select * from TEMP where ctftp NOT like 'I%';
-- ORDER BY vs SORT BY: ORDER BY is a global sort; SORT BY orders data flowing from map to reduce and only guarantees order within each reducer
WITH result as (
select * from db_hive1.room4
)
,TEMP AS (
SELECT t.ctftp,count(*) as cnt FROM result t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100
)
SELECT * FROM TEMP SORT BY ctftp DESC ; -- sorts within each reducer only
select * from test_db.room3;
-- Functions
show functions ;
DESC FUNCTION substring;
DESC FUNCTION EXTENDED substring;
-- UDF: one row in, one value out
SELECT round(3.342,2) AS RS;
SELECT substring("12345678",3,2);
SELECT substr("12345678",3,2);
SELECT substr("12345678",3);
-- Convert a datetime string to a Unix timestamp
SELECT unix_timestamp(); -- current Unix timestamp
SELECT unix_timestamp("2025-12-10 22:29:30","yyyy-MM-dd HH:mm:ss");
DESC FUNCTION unix_timestamp;
-- Convert a Unix timestamp to a datetime string
SELECT from_unixtime(unix_timestamp()) dat;
SELECT from_unixtime(1765423770,'yyyy-MM-dd HH:mm:ss') dat;
SELECT from_utc_timestamp(cast(1765378139 as bigint)*1000,'GMT+8');
SELECT current_date();
select date_format(current_timestamp(),'yyyy-MM-dd HH:mm:ss');
SELECT hour(current_timestamp());
select datediff('2025-12-11','2024-11-11'); -- datediff(enddate, startdate) returns the day difference
select date_add('2025-12-10',1); -- add days
select date_sub('2025-12-10',1); -- subtract days
SELECT date_format(date '2025-12-10','yyyy-MM-dd');
-- Conditional expressions
select case when 1=2 then 0 else 1 end ;
select `if`(1=6,1,0);
select * from test_db.room2_view T WHERE T.gender IS NOT NULL;
select * from db_hive1.room4;
-- Collection functions
SELECT `array`(1,2,34);
select `map`("city","gzhou","addr","adfas","rs",1);
-- Check whether an element exists in an array
select array_contains(array(1,2,34),1);
select sort_array(array(1,2,34,3,5,7));
select size(array(1,2,34,3,5,7));
select map_keys(map("city","gzhou","addr","adfas","rs",1));
select map_values(map("city","gzhou","addr","adfas","rs",1));
select struct("fdafj","fafda","fdaf");
select named_struct("key1",1,"cl1","fdaf");
select month(`current_date`());
select date '2025-12-11';
select datediff(date '2025-12-11' , date '2024-11-11' ) as dat;
select decode(1,1,1,0);
desc function extended decode;
SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');
-- Collection aggregate functions
SELECT
collect_list(gender) a_list, -- aggregates many rows into one array
collect_set(cnt) as a_set -- same, with duplicates removed
FROM test_db.room2_view WHERE length(trim(gender))>0;
-- UDTF: one row in, multiple rows out
SELECT explode(ARRAY("21",'22',324));
SELECT posexplode(ARRAY("21",'22',324));
SELECT inline(ARRAY(struct('DFS',1,"FDFSFS",1.098)));
SELECT T.idky, k, v FROM db_hive1.tb_test1 T LATERAL VIEW explode(T.addr_map) m AS k, v; -- maps are expanded with explode (posexplode only takes arrays), and a UDTF needs LATERAL VIEW to sit beside other columns
select a.name, t.city, t.amt, t.room from db_hive1.teacher a lateral view inline(array(a.adderss)) t as city, amt, room;
-- Invalid: a UDTF cannot be selected alongside other columns without LATERAL VIEW (correct form below)
-- select A.name,explode(A.friend) as fd
-- from db_hive1.teacher A ;
select A.name,A.friend,
fd
from db_hive1.teacher A
lateral view explode(a.friend) tem as fd;
select t.adderss["room"] from db_hive1.teacher t;
select posexplode(split("wzp,jq,xiaowang",','));
-- Window functions
SELECT * FROM (
select
R.sname,
r.ctfid,
count(*) over (partition by r.ctfid) cnt
from test_db.room3 r WHERE R.ctfid IS NOT NULL
) W WHERE W.CNT >1
;
-- Defining the window frame: row-based (ROWS) and value-based (RANGE)
select
gender,cnt,
sum(cnt) over (order by v.gender desc rows between 2 preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
--order by v.gender asc--limit 100
;
-- Running total
select
gender,cnt,
sum(cnt) over (order by v.gender desc
rows between unbounded preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;
-- Value-based frames: ORDER BY here is not just a sort; it defines the frame over that column's values
-- order by [column] rows/range between <start> and <end>
-- Running total within each gender partition
select
gender,cnt,
sum(cnt) over (partition by v.gender order by v.cnt asc
rows between unbounded preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;
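-- The query above still uses a ROWS frame. A value-based sketch of the same query: with RANGE,
-- peer rows (equal cnt values) enter the frame together, so ties are accumulated at once:
select
gender,cnt,
sum(cnt) over (partition by v.gender order by v.cnt asc
range between unbounded preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;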
select * from test_db.room2;
select * from test_db.room2_gender_out;
select * from test_db.room3;
drop table db_hive1.room4 purge;
create table db_hive1.room4
-- clustered by (id) -- hash-distributes rows by id: a bucketed table
-- sorted by (id)
-- into 3 buckets -- 3 bucket files
row format delimited fields terminated by "^|" stored as textfile -- ROW FORMAT must precede STORED AS; the default SerDe keeps only the first character of a multi-char delimiter
as select * from test_db.room3;
select * from db_hive1.room4;
SELECT T.name, f.friend, a.city, a.amt, a.room FROM db_hive1.teacher T LATERAL VIEW explode(T.friend) f AS friend LATERAL VIEW inline(array(T.adderss)) a AS city, amt, room; -- two UDTFs in one query each need their own LATERAL VIEW
desc function extended inline;
-- Broadcast (map-side) join hint: /*+ BROADCAST(a) */
select /*+ BROADCAST(a) */ a.gender,count(1) as cnts from test_db.room2_gender_out a
left join db_hive1.room4 b on a.gender = b.gender
where length(trim(a.gender))>0
group by a.gender order by count(*) desc limit 10
;
SELECT * FROM test_db.room2;
SELECT * FROM test_db.room3;
SELECT * FROM db_hive1.room4;
desc extended room4;
SELECT t.id,count(*) as cnt FROM test_db.room2 t group by t.id having count(*)>1 limit 100;
SELECT * FROM test_db.room2 r where r.id is null;
select /*+ BROADCAST(a) */ a.gender,count(1) as cnts from test_db.room2_gender_out a
left join test_db.room2 b on a.gender = b.gender
where length(trim(a.gender))>0
group by a.gender order by count(*) desc limit 10
;
-- Ranking; with ORDER BY the window accumulates from the first row to the current row, without ORDER BY it spans all rows
SELECT T.CUST_ID,count(*) as cnt,
row_number() over (order by count(*) desc) as rn,
rank() over (order by count(*) desc) as rn1,
dense_rank() over (order by count(*) desc) as rn2
FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID;
SELECT distinct T.CUST_ID,
sum(t.TR_AMT) over () as rn -- equivalent to a global sum
FROM db_hive1.t2a_trans T ;
-- LEAD and LAG: fetch values from following/preceding rows
WITH TEMP AS (SELECT T.CUST_ID,count(*) as cnt,
row_number() over (order by count(*) desc) as rn
FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID)
SELECT P.*,
LEAD(P.rn) over (order by p.rn asc) AS TOP_NUM, -- value of rn on the next row in rn order
LAG(P.rn,1,0) OVER(order by p.rn asc) AS LOW_NUM -- value of rn on the previous row, defaulting to 0
FROM TEMP P;
-- first_value and last_value: the first and last row of the window frame
WITH TEMP AS (
SELECT T.CUST_ID,count(*) as cnt,
row_number() over (order by count(*) desc) as rn
FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID)
SELECT P.*,
-- LEAD(P.rn) over (order by p.rn asc) AS TOP_NUM, -- value of rn on the next row
-- LAG(P.rn,1,0) OVER(order by p.rn asc) AS LOW_NUM, -- value of rn on the previous row
first_value(p.rn,false) over (order by p.rn asc ) as first_num, -- second argument: whether to skip NULLs
last_value(p.cnt,false) over (order by p.rn asc
-- rows between current row and unbounded following
) as last_num -- second argument: whether to skip NULLs; with the default frame this returns the current row, not the partition's last
FROM TEMP P ;
-- Final topic of Hive basics: user-defined functions
-- UDF: one in, one out; UDAF: many in, one out; UDTF: one in, many out
-- Create a Maven project and add the hive-exec 3.1.3 dependency
-- Upload the UDF jar
add jar /home/hadoop/myudfs.jar; -- session-scoped only; a permanent function needs its jar on HDFS
-- For a permanent function the jar must live on a path every Hive node can read (such as HDFS)
-- Add the TEMPORARY keyword for a session-scoped function
CREATE FUNCTION function_name
as "com.java.func.function_name"
using jar "hdfs://hadoop01:9000/udf/myudfs.jar";
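-- A TEMPORARY variant looks like this (my_func and com.java.func.MyFunc are placeholder names):
CREATE TEMPORARY FUNCTION my_func
as "com.java.func.MyFunc"
using jar "hdfs://hadoop01:9000/udf/myudfs.jar";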
-- hdfs dfs -ls hdfs://hadoop01:9000/hivedata/db_hive1/
-- Partitioned tables
-- Partitioned table with dynamic partitioning
drop table db_hive1.t2a_trans purge;
CREATE EXTERNAL TABLE t2a_trans (
TR_ID STRING COMMENT 'transaction ID',
CUST_ID STRING COMMENT 'customer ID',
TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
product_code STRING COMMENT 'product code'
--data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction fact table'
PARTITIONED BY (tr_dt date) -- partitioned by date; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' -- JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC; -- columnar formats such as ORC or Parquet are recommended
truncate table db_hive1.t2a_trans;
INSERT INTO db_hive1.t2a_trans
(TR_ID, CUST_ID, TR_AMT, product_code, tr_dt)
SELECT cast(cast(r.id as bigint) as string) as tr_id,
r.sname as cust_id,
round(rand(),2) as tr_amt,
r.ctfid as product_code,
cast(concat(substr(r.version,1,4),'-01-01') as date) as tr_dt -- build the date string and cast; Hive's to_date takes a single argument
FROM db_hive1.room4 r
where r.version is not null
--and r.sname like '%鬼%'
;
select tr_dt,count(*) from db_hive1.t2a_trans t where t.tr_dt>=date'2010-01-01' group by tr_dt order by t.tr_dt limit 100;
SELECT * FROM db_hive1.t2a_trans;
select * from test_db.room2_view;
-- (when a window frame clause is omitted, the default frame applies)
show create table db_hive1.t2a_trans;
SHOW PARTITIONS t2a_trans;
-- Add a partition
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01");
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01") PARTITION (tr_dt = DATE "2022-11-02");
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01"),PARTITION (tr_dt = DATE "2022-11-02");
TRUNCATE TABLE t2a_trans PARTITION (tr_dt = DATE "2022-11-01");
SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";
-- Repairing partitions
alter table t2a_trans add partition (tr_dt = DATE "2022-11-01");
-- Deleting partition data directly on HDFS leaves the metastore unchanged; drop the matching partition to keep the two consistent
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
-- General-purpose metadata repair: MSCK
msck repair table t2a_trans add partitions ; -- adds metastore entries for partitions that exist on HDFS but not in the metastore
msck repair table t2a_trans drop partitions ; -- removes metastore entries whose HDFS paths no longer exist
msck repair table t2a_trans sync partitions ; -- keeps the metastore and HDFS paths in sync in both directions
msck repair table t2a_trans ; -- with no option, defaults to ADD PARTITIONS
-- Partitioned tables: two-level partitions
DROP TABLE db_hive1.t2a_trans_hst PURGE;
-- EXTERNAL
CREATE TABLE db_hive1.t2a_trans_hst (
TR_ID STRING COMMENT 'transaction ID',
CUST_ID STRING COMMENT 'customer ID',
TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
product_code STRING COMMENT 'product code'
--data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction history table'
PARTITIONED BY (year_dt string,month_dt string) -- two partition levels: year, then month
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' -- JSON SerDe
ROW FORMAT DELIMITED FIELDS TERMINATED BY "|^|"
STORED AS textfile -- columnar formats such as ORC or Parquet are recommended
--location "/hivedata/db_hive1/t2a_trans_hst"
;
desc extended db_hive1.t2a_trans_hst;
-- spark-thriftserver cannot execute this; connect through Hive JDBC directly
truncate table db_hive1.t2a_trans_hst;
-- SET hive.exec.dynamic.partition=true; -- enable dynamic partitioning
-- SET hive.exec.dynamic.partition.mode=nonstrict; -- allow all partition columns to be dynamic
insert into db_hive1.t2a_trans_hst PARTITION (year_dt,month_dt)
select A.tr_id,A.CUST_ID,A.TR_AMT,A.product_code ,year(A.tr_dt) as year_dt, month(A.tr_dt) as month_dt
from db_hive1.t2a_trans A
where year(A.tr_dt) is not null and month(A.tr_dt) is not null
and a.cust_id is not null
;
select year_dt,count(*) from db_hive1.t2a_trans_hst group by year_dt order by count(1) desc limit 100;
select * from db_hive1.t2a_trans_hst T WHERE instr(T.CUST_ID,"吴")>0 AND T.CUST_ID NOT LIKE '吴%';
select * from db_hive1.t2a_trans_hst T where t.CUST_ID is not null;
drop table db_hive1.stu_buck purge ;
-- Bucketed tables; used by the optimizations later on
create table stu_buck(
id string,
name string
)clustered by (id) -- hash-distributes rows by id: a bucketed table
sorted by (id)
into 4 buckets -- 4 bucket files
--row format delimited fields terminated by "|@|"
STORED AS ORC
;
desc extended stu_buck ;
select * from db_hive1.stu_buck;
insert into stu_buck
(id,name)
select t.tr_id,t.CUST_ID from db_hive1.t2a_trans t;
select * from db_hive1.stu_buck;
load data local inpath "<local file path>"
into table db_hive1.stu_buck;
-- Bucketed and sorted table (drop the stu_buck above first; the name is reused)
create table stu_buck(
id string,
name string
)clustered by (id) -- hash-distributes rows by id: a bucketed table
sorted by (id)
into 4 buckets -- 4 bucket files
row format delimited fields terminated by "^"
;
SELECT * FROM db_hive1.stu_buck;
select t.gender,sum(t.cnt) from test_db.room2_gender_out t group by t.gender order by sum(t.cnt) desc;
-- Storage formats and compression
-- textfile, orc, parquet, sequencefile
-- ORC is a columnar storage format
show create table db_hive1.room4;
SELECT * FROM db_hive1.room4;
-- Execution plan
EXPLAIN EXTENDED select t.gender,sum(t.cnt)
from test_db.room2_gender_out t
group by t.gender order by sum(t.cnt) desc LIMIT 100;
-- JOIN algorithms
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
FROM test_db.room2_view
where length(trim(gender)) > 0
GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) desc;
-- map join
explain extended
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
FROM test_db.room2_view
where length(trim(gender)) > 0
GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) asc;
select w.gender,sum(cnt) as sum_cnt from test_db.room2_view w group by w.gender order by sum_cnt desc limit 100;
-- Optimization techniques
-- common join
-- the default shuffle (reduce-side) join
-- map join
-- broadcasts the small table into memory
-- bucket map join
-- when even the smaller table is too big to broadcast whole, bucketing lets each mapper load just the matching bucket
-- sort merge bucket map join
-- both sides bucketed and sorted on the join key, with bucket counts in multiples of each other
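-- Bucket map join has its own switch as well:
set hive.optimize.bucketmapjoin=true;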
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
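-- A sketch of the bucket-count rule (big_t and small_t are illustrative tables, not from this script):
-- create table big_t (id string, v string) clustered by (id) sorted by (id) into 8 buckets stored as orc;
-- create table small_t (id string, v string) clustered by (id) sorted by (id) into 4 buckets stored as orc;
-- With 8 = 2 x 4 buckets on the same key, matching buckets can be merge-joined in sorted order:
-- select b.id, b.v, s.v from big_t b join small_t s on b.id = s.id;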
-- Data skew in aggregation
-- Option 1: when group-by keys are unevenly distributed, pre-aggregate on the map side
set hive.map.aggr = true; -- enable map-side aggregation
set hive.map.aggr.hash.min.reduction=0.5; -- minimum row-count reduction ratio required to keep map-side aggregation on
set hive.groupby.mapaggr.checkinterval=100000; -- number of rows sampled when checking that ratio
-- Option 2:
-- skew-groupby: skew-aware group-by optimization
set hive.groupby.skewindata = true;
-- Launches two MR jobs: the first scatters rows to reducers by random key and partially aggregates; the second does the final group-by
-- Data skew in joins
-- Option 1: map join; fits joins of a large table to a small table
-- Option 2: skew join; each heavily skewed key gets its own map-join task, suited to large-to-large joins
-- handles one-to-many skew
-- automatically detects large skewed keys and routes them through a map-join task
set hive.optimize.skewjoin = true;
set hive.skewjoin.key = 100000; -- a key with more than 100,000 rows triggers the separate MR job
-- Option 3: rewrite the SQL by hand, for large-to-large joins where one side's key is skewed
-- In a subquery, salt the skewed table's join key with a random suffix to spread it across reducers:
-- concat(key_col,'_',cast(rand()*n as int)) -- spreads rows evenly
-- Then expand the non-skewed table (whose key is unique) n times, once per suffix:
-- concat(key_col,'_',1)
-- union all concat(key_col,'_',2)
-- union all concat(key_col,'_',n)
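-- A concrete sketch of option 3 with n = 3; skewed_t and uniq_t are illustrative tables, and the
-- lateral view replaces the three-way union all for expanding the non-skewed side:
-- select s.key_col, s.val, u.val
-- from (select key_col, val,
--              concat(key_col,'_',cast(rand()*3 as int)) as salted_key
--       from skewed_t) s
-- join (select concat(key_col,'_',sfx) as salted_key, val
--       from uniq_t lateral view explode(array(0,1,2)) t as sfx) u
-- on s.salted_key = u.salted_key;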
-- Enable CBO (cost-based optimization): joins small and medium tables first, then the large one
set hive.cbo.enable = true;
set hive.auto.convert.join = false;
-- Enable predicate pushdown: pushes WHERE filters down in the plan
set hive.optimize.ppd = true; -- on by default
-- CBO also performs part of the predicate pushdown
-- Vectorized execution
set hive.vectorized.execution.enabled=true;
-- Local mode
set hive.exec.mode.local.auto = true;
-- Parallel execution: stages whose MR jobs do not depend on each other run in parallel
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number = 8;
-- Strict mode: guards against dangerous queries; set these to true globally
set hive.strict.checks.no.partition.filter=true; -- partitioned tables must be queried with a partition filter
-- Reject ORDER BY without LIMIT, keeping global sorts bounded
set hive.strict.checks.orderby.no.limit=true;
-- Reject Cartesian products
set hive.strict.checks.cartesian.product=true;
explain FORMATTED
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
FROM test_db.room2_view
where length(trim(gender)) > 0
GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) asc limit 100;