Hive command reference: optimization, data skew, partitioning and bucketing

SELECT T.sname,T.ctfid,T.gender,t.address,
       count(*) OVER(PARTITION BY T.sname) AS FM_CNT
       FROM test_db.room3 T WHERE  T.address like "%北京%" AND instr(T.ctfid,'310')>0
ORDER BY FM_CNT DESC;


select gender,sum(cnt) as cnt from test_db.room2_view where length(trim(gender))>0 group by gender order by cnt desc;


desc test_db.room2;

-- DROP TABLE default.room2_gender_out PURGE ;

-- Drop a database
--DROP DATABASE db_hive1;

-- 1. Create a database
CREATE DATABASE IF NOT EXISTS db_hive1
    COMMENT "hive tutorial database" -- database comment
    LOCATION '/hivedata/db_hive1' -- HDFS storage path for the database
    WITH DBPROPERTIES ("create_dt"="2025-12-07","create_user"="hadoop") -- key/value database properties
;

-- List databases matching a pattern
SHOW DATABASES LIKE "*db*";

-- Show database info
describe DATABASE EXTENDED db_hive1;

-- DESC is the shorthand form
DESC DATABASE EXTENDED db_hive1;

-- 2. Alter a database
ALTER DATABASE db_hive1 set DBPROPERTIES ("create_dt"="2025-12-08","create_user"="wwwdp") ;
ALTER DATABASE db_hive1 set LOCATION '/hivedata/db_hive1'; -- HDFS storage path
ALTER DATABASE db_hive1 set OWNER USER hadoop; -- change the database owner


-- Drop a database
--DROP DATABASE db_hive1 RESTRICT ; -- fails if the database is not empty
--DROP DATABASE db_hive1 CASCADE ; -- drops the tables together with the database

-- Switch database
use test_db;

-- 3. Creating tables
-- Managed (internal) table
drop table db_hive1.tb_test1 purge ;
CREATE TABLE db_hive1.tb_test1
(dat_dt date COMMENT "DATE",
idky bigint COMMENT "主键",
amt decimal(30,4) COMMENT "AMT",
name varchar(4000) COMMENT "NAME",
work_list array<string> COMMENT "ARRAY",
addr_map MAP<string,string> COMMENT "Map"
)
COMMENT "nei bu biao"
--location "/hivedata/db_hive1/tb_test1"
-- row format delimited
-- ROW FORMAT SERDE ''
STORED AS orc -- textfile/orc/parquet
TBLPROPERTIES ("create_dt"="20251207")
;
INSERT INTO db_hive1.tb_test1 VALUES
(date '2023-10-01', 123456789, 1234.5678, 'John Doe',
 array('work1', 'work2'), map('city', 'New York', 'country', 'USA'));

select * from db_hive1.tb_test1 t;
select t.dat_dt,t.idky,t.name,
       t.work_list[0],
       t.work_list[1],
       t.addr_map["city"],
       t.addr_map["country"]
       from db_hive1.tb_test1 t;

-- Show table details
desc extended db_hive1.tb_test1;


-- Session-scoped temporary table
CREATE TEMPORARY TABLE db_hive1.tb_test2
(dat_dt date,
idky int,
name string)
COMMENT "temporary table"
STORED AS ORC -- textfile/orc/parquet
;
-- select * from db_hive1.tb_test2;

-- External table
CREATE EXTERNAL TABLE db_hive1.tb_test3
(dat_dt date,
idky int,
name string)
COMMENT "external table"
STORED AS orc -- textfile/orc/parquet
location "/hivedata/db_hive1/tb_test3"
TBLPROPERTIES ("tab"="tb_test3")
;

desc extended db_hive1.tb_test3;


select date '2024-10-01' dat;



-- Data types
select cast('908.342' as decimal(30,4)) as vles;


-- CREATE TABLE ... LIKE
CREATE TABLE db_hive1.room4 like db_hive1.tb_test1;

-- CREATE TABLE ... AS SELECT (CTAS)
drop table  db_hive1.room4 purge ;
create table db_hive1.room4
    stored as parquet
    as
    select * from test_db.room3;

use db_hive1;

select * from db_hive1.room4 r where r.address like "%剑河%";



-- Show the table's DDL
show create table db_hive1.tb_test1;



SELECT T.sname,T.ctfid,T.gender,t.address,
       count(*) OVER(PARTITION BY T.sname) AS FM_CNT
       FROM db_hive1.room4 T WHERE  T.address like "%北京%" AND instr(T.ctfid,'310')>0
ORDER BY FM_CNT DESC;


-- Create a table with a custom field delimiter
CREATE TABLE db_hive1.tb_test5
(dat_dt date,
idky int,
name string)
COMMENT "managed table"
ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS textfile -- must be textfile for the hdfs put below to work; orc/parquet need INSERT
location "/hivedata/db_hive1/tb_test5"
TBLPROPERTIES ("tab"="tb_test5")
;

-- You can load data into tb_test5 by putting files onto HDFS directly; each line of input.txt must use |@| as the field delimiter
-- hdfs dfs -put input.txt /hivedata/db_hive1/tb_test5
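
-- Aside (not from the original notes): the default LazySimpleSerDe behind ROW FORMAT DELIMITED honors only a
-- single delimiter character, so a multi-character delimiter like |@| may not split as expected. A minimal
-- sketch using MultiDelimitSerDe, assuming the class is available in your Hive build (it ships in hive-contrib
-- in many versions; the package name can differ across releases):
CREATE TABLE db_hive1.tb_test5_multi
(dat_dt date,
idky int,
name string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe'
WITH SERDEPROPERTIES ("field.delim"="|@|")
STORED AS textfile;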



-- For a JSON-backed Hive table you can load data by putting JSON files straight into its location
drop table db_hive1.teacher;
create table db_hive1.teacher(
    name string comment "name",
    friend array<varchar(256)> comment "friends",
    students map<string,int> comment "students",
    adderss struct<city:string,amt:decimal(30,4),room:array<int>> comment "address"
)
--row format serde 'org.apache.hadoop.hive.serde2.JsonSerDe'  -- use the JSON SerDe
--location "/hivedata/db_hive1/teacher"
;

INSERT INTO db_hive1.teacher
select "hadoop1" as name,
       array("fdaf","fdafdfff") as friend,
       map("city",64) as students,
       struct("guiyangshi",123.543,array(9,0,32)) as adderss;



INSERT INTO db_hive1.teacher
values ("hadoop1",
        array("fdaf", "fdaffdas"),
        map("city", 64),
        struct("guiyangshi", 123.543, `array`(1,2,3,4)));


select * from teacher;

-- Show table info
desc extended teacher;
desc formatted teacher;
show tables in db_hive1 "teacher" ;

-- ALTER TABLE syntax
drop table stu;
create table db_hive1.stu
(id int,
name string)
row format delimited fields terminated by ",";

ALTER TABLE stu RENAME TO  stu1;
-- Add a column (the table was just renamed to stu1)
ALTER TABLE stu1 add columns(gender string);
-- Change a column
set hive.metastore.disallow.incompatible.col.type.changes=false;

alter table stu1 change column gender  gend string after name;

alter table db_hive1.stu1 replace columns(id int,name string,gend string);

-- TRUNCATE works only on managed tables; it fails on external tables
truncate table db_hive1.stu1;

desc stu1;

-- II. DML: insert, delete, update, query
-- Load from the local filesystem. OVERWRITE replaces existing data; without it rows are appended. LOCAL INPATH is resolved on the node where the Hive service was started
LOAD DATA LOCAL INPATH "/home/hadoop/stu.txt"  INTO TABLE db_hive1.stu;
-- Load from an HDFS path
LOAD DATA INPATH "" OVERWRITE INTO TABLE db_hive1.stu;

-- Partitioned table
-- LOAD DATA INPATH "" OVERWRITE INTO TABLE db_hive1.stu PARTITION (....);

select * from db_hive1.stu;

-- Insert multiple rows
insert into db_hive1.stu
values(1,"faf"),
      (2,"test");

-- Write the query results to a local directory
insert overwrite local directory "/home/hadoop/data/stu"
    row format serde "org.apache.hadoop.hive.serde2.JsonSerDe"
select id,name from db_hive1.stu;

set mapreduce.job.reduces=1; -- force a single reducer so the export lands in one file
insert overwrite local directory "/home/hadoop/data/stu"
    row format delimited --serde "org.apache.hadoop.hive.serde2.JsonSerDe"
select id,name from db_hive1.stu;

-- Export and import

EXPORT TABLE db_hive1.stu to "/hivedata/stuout"; -- export data plus metadata to the HDFS filesystem
IMPORT /*external*/ TABLE stu from "/hivedata/stuout"; -- import table metadata and data; EXTERNAL declares the target as an external table


-- III. Hive queries
--SELECT * FROM stu WHERE ID = 1 GROUP BY id HAVING count(*)>1 ORDER BY id LIMIT 1,10;

SELECT * FROM test_db.room2_view limit 4;

SELECT * FROM test_db.room2_view w where not w.gender ='12';

SELECT * FROM db_hive1.stu T WHERE T.id BETWEEN 0 AND 2;

SELECT * FROM db_hive1.stu T WHERE nvl(T.id,1) IN(0,1);

SELECT * FROM db_hive1.stu T where T.name like "%f%";

SELECT * FROM db_hive1.stu T where T.name regexp "your regex pattern";
-- Logical operators: and, or, not
-- Aggregate functions: count(*), avg(), max(), min()


-- Local mode
-- set mapreduce.framework.name=local;


select * from stu;



SELECT
       row_number() over (order by t.ctfid) as rn,
       T.* FROM db_hive1.room4  T LIMIT 1000;



SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t where t.ctftp is not null group by t.ctftp order by count(*) desc;


-- Sorting: try to follow ORDER BY with a LIMIT; the map side can then pre-trim to the top N rows, so the single reducer does not run out of memory
SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100;

set mapreduce.job.reduces;

SELECT count(1) FROM test_db.room3;

WITH TEMP AS (
    SELECT t.ctftp,count(*) as cnt FROM db_hive1.room4 t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100
)
select * from TEMP where ctftp NOT like 'I%';


-- Difference between ORDER BY and SORT BY: ORDER BY is a global sort; SORT BY sorts data flowing from map to reduce and only guarantees order within each reducer.
WITH result as (
   select * from db_hive1.room4
)
  ,TEMP AS (
    SELECT t.ctftp,count(*) as cnt FROM result t
where t.ctftp is not null group by t.ctftp order by count(*) DESC limit 100
)
SELECT * FROM TEMP SORT BY ctftp DESC ; -- sorts within each reducer only
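
-- SORT BY is usually paired with DISTRIBUTE BY, which decides which reducer each key goes to.
-- A minimal sketch against the same room4 table (assumes more than one reducer so the per-reducer ordering is visible):
set mapreduce.job.reduces=3;
SELECT t.ctftp, t.sname
FROM db_hive1.room4 t
DISTRIBUTE BY t.ctftp
SORT BY t.ctftp DESC;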

select * from test_db.room3;

-- Functions

show functions ;


DESC FUNCTION substring;

DESC FUNCTION EXTENDED substring;


-- UDF: one row in, one row out
SELECT round(3.342,2) AS RS;

SELECT substring("12345678",3,2);
SELECT substr("12345678",3,2);
SELECT substr("12345678",3);

-- Convert a datetime string to a Unix timestamp
SELECT unix_timestamp();-- current Unix timestamp
SELECT unix_timestamp("2025-12-10 22:29:30","yyyy-MM-dd HH:mm:ss");


DESC FUNCTION unix_timestamp;
-- Convert a Unix timestamp to a datetime string
SELECT from_unixtime(unix_timestamp()) dat;

SELECT from_unixtime(1765423770,'yyyy-MM-dd HH:mm:ss') dat;



SELECT from_utc_timestamp(cast(1765378139 AS BIGINT)*1000,'GMT+8');


SELECT current_date();

select date_format(current_timestamp(),'yyyy-MM-dd HH:mm:ss');

SELECT hour(current_timestamp());

select datediff('2025-12-11','2025-12-01'); -- days between two dates

select date_add('2025-12-10',1); -- add N days
select date_sub('2025-12-10',1); -- subtract N days


SELECT date_format(date '2025-12-10','yyyy-MM-dd');

-- Conditional expressions
select case when 1=2 then 0 else 1 end ;

select `if`(1=6,1,0);



select * from test_db.room2_view T WHERE T.gender IS NOT NULL;

select * from db_hive1.room4;


-- Collection functions


SELECT `array`(1,2,34);

select `map`("city","gzhou","addr","adfas","rs",1);

-- check whether an element exists in an array
select array_contains(array(1,2,34),1);

select sort_array(array(1,2,34,3,5,7));

select size(array(1,2,34,3,5,7));

select map_keys(map("city","gzhou","addr","adfas","rs",1));

select map_values(map("city","gzhou","addr","adfas","rs",1));

select struct("fdafj","fafda","fdaf");

select named_struct("key1",1,"cl1","fdaf");

select month(`current_date`());

select date '2025-12-11';
select datediff(date '2025-12-11' , date '2024-11-11' ) as dat;

select decode(1,1,1,0);

desc function extended decode;

SELECT decode(2, 1, 'Southlake', 2, 'San Francisco', 3, 'New Jersey', 4, 'Seattle', 'Non domestic');


-- Advanced aggregation over collections

SELECT
    collect_list(gender) a_list, -- aggregate many rows into one array
    collect_set(cnt) as a_set -- aggregate many rows into one deduplicated array
    FROM test_db.room2_view WHERE length(trim(gender))>0;

-- UDTF: one row in, multiple rows out

SELECT explode(ARRAY("21",'22',324));

SELECT posexplode(ARRAY("21",'22',324));


SELECT inline(ARRAY(struct('DFS',1,"FDFSFS",1.098)));


-- invalid as originally written (posexplode over a map, selected alongside another column); explode the map via LATERAL VIEW instead
SELECT T.idky, a.addr_key, a.addr_value
FROM db_hive1.tb_test1 T
    LATERAL VIEW explode(T.addr_map) a AS addr_key, addr_value;


select a.name, addr.city, addr.amt, addr.room, a.adderss
       from teacher a
    lateral view inline(array(a.adderss)) addr as city, amt, room; -- inline must go through LATERAL VIEW when other columns are selected



-- invalid as written: a UDTF such as explode cannot appear alongside other select columns;
-- the LATERAL VIEW form below is the correct pattern
--select A.name,explode(A.friend) as fd
--       from db_hive1.teacher A ;

select A.name,A.friend,
       fd
       from db_hive1.teacher A
    lateral view explode(a.friend) tem as fd;

select t.adderss.room from db_hive1.teacher t; -- struct fields are accessed with dot notation (brackets are for maps/arrays)


select posexplode(split("wzp,jq,xiaowang",','));


-- Window functions
SELECT * FROM (
select
    R.sname,
    r.ctfid,
    count(*) over (partition by r.ctfid) cnt
    from test_db.room3 r WHERE R.ctfid IS NOT NULL
        ) W WHERE W.CNT >1
;

-- Defining window frames: row-based (ROWS) and value-based (RANGE)

select
    gender,cnt,
    sum(cnt) over (order by v.gender desc rows between 2 preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
--order by v.gender asc--limit 100
;
-- Running total
select
    gender,cnt,
    sum(cnt) over (order by v.gender desc
        rows between unbounded preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;

-- Value-based (RANGE) windows: here ORDER BY does not merely sort; the window is defined by the values of that column
-- order by [column] rows/range between <start> and <end>

-- running sum of cnt within each gender partition
select
    gender,cnt,
    sum(cnt) over (partition by v.gender order by v.cnt asc
        rows between unbounded preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;
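
-- For contrast with the ROWS frames above, a value-based RANGE frame; a minimal sketch on the same view,
-- where the frame covers all rows whose cnt lies within 10 below the current row's cnt:
select
    gender,cnt,
    sum(cnt) over (partition by v.gender order by v.cnt asc
        range between 10 preceding and current row ) as lj
from test_db.room2_view v where length(trim(v.gender))>0
;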

select * from test_db.room2;

select * from test_db.room2_gender_out;

select * from test_db.room3;

drop table db_hive1.room4 purge;

create table db_hive1.room4
   -- clustered by (id)  -- hash-partitioned on id, i.e. a bucketed table
   -- sorted by (id)
   -- into 3 buckets  -- 3 bucket files
    row format delimited fields terminated by "^|" stored as textfile
    as select * from test_db.room3;


select * from db_hive1.room4;


SELECT fd.friend, addr.city, addr.amt, addr.room
FROM db_hive1.teacher T
    LATERAL VIEW explode(T.friend) fd AS friend
    LATERAL VIEW inline(array(T.adderss)) addr AS city, amt, room; -- two UDTFs cannot share one SELECT list; chain LATERAL VIEWs instead


desc function extended inline;


-- /*+ BROADCAST(a) */ is a broadcast-join hint (Spark SQL syntax; the Hive-native equivalent is /*+ MAPJOIN(a) */)

select /*+ BROADCAST(a) */ a.gender,count(1) as cnts from test_db.room2_gender_out a
            left join db_hive1.room4 b on a.gender = b.gender
    where length(trim(a.gender))>0
    group by a.gender order by count(*) desc limit 10
;

SELECT * FROM test_db.room2;
SELECT * FROM test_db.room3;
SELECT * FROM db_hive1.room4;

desc extended room4;

SELECT t.id,count(*) as cnt FROM test_db.room2 t group by t.id having count(*)>1 limit 100;

SELECT * FROM test_db.room2 r where r.id is null;



select /*+ BROADCAST(a) */ a.gender,count(1) as cnts from test_db.room2_gender_out a
            left join test_db.room2 b on a.gender = b.gender
    where length(trim(a.gender))>0
    group by a.gender order by count(*) desc limit 10
;

-- Ranking. With ORDER BY, a window aggregate accumulates from the first row to the current row; without ORDER BY it aggregates over all rows
SELECT T.CUST_ID,count(*) as cnt,
       row_number() over (order by count(*) desc) as rn,
       rank() over (order by count(*) desc) as rn1,
       dense_rank() over (order by count(*) desc) as rn2
       FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID;

SELECT distinct T.CUST_ID,
       sum(t.TR_AMT) over () as rn  -- equivalent to a global sum over all rows
       FROM db_hive1.t2a_trans T ;


-- LEAD and LAG: fetch values from the following / preceding rows

WITH TEMP AS (SELECT T.CUST_ID,count(*) as cnt,
       row_number() over (order by count(*) desc) as rn
       FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID)
SELECT  P.*,
        LEAD(P.rn) over (order by p.rn asc) AS TOP_NUM,  -- rn value from the next row in rn order
        LAG(P.rn,1,0) OVER(order by p.rn asc) AS LOW_NUM -- rn value from the previous row (default 0)
        FROM TEMP P;



-- first_value and last_value: the first and the last row of the window frame



WITH TEMP AS (
SELECT T.CUST_ID,count(*) as cnt,
       row_number() over (order by count(*) desc) as rn
       FROM db_hive1.t2a_trans T GROUP BY T.CUST_ID)
SELECT  P.*,
       -- LEAD(P.rn) over (order by p.rn asc) AS TOP_NUM,  -- next row's value in rn order
       -- LAG(P.rn,1,0) OVER(order by p.rn asc) AS LOW_NUM, -- previous row's value in rn order
        first_value(p.rn,false) over (order by p.rn asc ) as first_num,  -- second argument: whether to skip NULLs
        last_value(p.cnt,false) over (order by p.rn asc
         --   rows between current row and unbounded following
            ) as last_num  -- second argument: whether to skip NULLs
FROM TEMP P ;



-- Final topic in Hive basics: user-defined functions
-- UDF: one row in, one row out; UDAF: many rows in, one row out; UDTF: one row in, many rows out

-- Create a Maven project and add the hive-exec 3.1.3 dependency

-- Upload the custom-function jar
add jar /home/hadoop/myudfs.jar; -- ADD JAR only lasts for the current session; a permanent function needs the jar on an HDFS path

-- Create a permanent function; the jar must be uploaded where Hive can reach it (e.g. HDFS)
-- for a temporary function, add the TEMPORARY keyword
CREATE FUNCTION function_name
as "com.java.func.function_name"
using jar "hdfs://hadoop01:9000/udf/myudfs.jar";
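
-- Usage sketch for the temporary variant; the class name and jar path are the placeholders from the example
-- above, not a real library:
CREATE TEMPORARY FUNCTION my_func
as "com.java.func.function_name"
using jar "hdfs://hadoop01:9000/udf/myudfs.jar";

SELECT my_func(name) FROM db_hive1.stu;

DROP TEMPORARY FUNCTION my_func;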


--  hdfs dfs -ls hdfs://hadoop01:9000/hivedata/db_hive1/




-- Partitioned tables

-- Partitioned table with dynamic partitioning
drop table db_hive1.t2a_trans purge;

CREATE EXTERNAL TABLE t2a_trans (
    TR_ID STRING COMMENT 'transaction id',
    CUST_ID STRING COMMENT 'customer id',
    TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
    product_code STRING COMMENT 'product code'
    --data_dt DATE COMMENT 'transaction create time'
)
COMMENT 'transaction fact table'
PARTITIONED BY (tr_dt date) -- partitioned by date; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'  -- use the JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC; -- columnar formats such as ORC or Parquet are recommended

truncate table db_hive1.t2a_trans; -- note: TRUNCATE fails on external tables unless TBLPROPERTIES('external.table.purge'='true') is set


INSERT INTO db_hive1.t2a_trans
(TR_ID, CUST_ID, TR_AMT, product_code, tr_dt)
SELECT cast(cast(r.id as bigint) as string) as tr_id,
       r.sname as cust_id,
       round(rand(),2) as tr_amt,
       r.ctfid as product_code,
       to_date(substr(r.version,1,4)||'-01-01') as tr_dt -- Hive's to_date takes a single 'yyyy-MM-dd' string argument
       FROM db_hive1.room4 r
    where r.version is not null
    --and r.sname like '%鬼%'
;
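
-- The INSERT above relies on dynamic partitioning (tr_dt is computed per row). For comparison, a minimal
-- sketch of a static-partition insert, where the target partition is fixed in the statement:
INSERT OVERWRITE TABLE db_hive1.t2a_trans PARTITION (tr_dt = date '2022-11-01')
SELECT cast(cast(r.id as bigint) as string),
       r.sname,
       round(rand(),2),
       r.ctfid
FROM db_hive1.room4 r
where r.version is not null;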

select tr_dt,count(*) from db_hive1.t2a_trans t where t.tr_dt>=date'2010-01-01' group by tr_dt order by t.tr_dt limit 100;

SELECT * FROM db_hive1.t2a_trans;

select * from test_db.room2_view;

-- Window frame clauses can be omitted; the defaults then apply

show create table db_hive1.t2a_trans;

SHOW PARTITIONS t2a_trans;

-- Add partitions (one, or several in a single statement)
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01");
ALTER TABLE t2a_trans ADD IF NOT EXISTS PARTITION (tr_dt = DATE "2022-11-01") PARTITION (tr_dt = DATE "2022-11-02");

alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
alter table t2a_trans drop IF EXISTS PARTITION (tr_dt = DATE "2022-11-01"),PARTITION (tr_dt = DATE "2022-11-02");


TRUNCATE TABLE t2a_trans PARTITION (tr_dt = DATE "2022-11-01");

SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";

-- Repairing partitions
alter table t2a_trans add partition (tr_dt = DATE "2022-11-01");

-- if partition data is deleted by hand on HDFS, the metastore is not updated; drop the matching partition to keep the two consistent
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");

-- General metadata repair: MSCK
msck repair table t2a_trans add partitions ; -- add metastore entries for partitions that exist on HDFS but are missing from the metastore
msck repair table t2a_trans drop partitions ;-- drop metastore entries whose HDFS paths no longer exist
msck repair table t2a_trans sync partitions ; -- keep the HDFS paths and the metastore in sync, both directions

msck repair table t2a_trans ; -- with no option the default is ADD PARTITIONS
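
-- A typical flow where MSCK is needed, sketched below; the directory shown assumes the table keeps its data
-- under the default warehouse layout (tr_dt=... subdirectories), which may differ in your setup:
-- hdfs dfs -mkdir -p /user/hive/warehouse/db_hive1.db/t2a_trans/tr_dt=2022-11-03
-- hdfs dfs -put part-00000 /user/hive/warehouse/db_hive1.db/t2a_trans/tr_dt=2022-11-03
msck repair table t2a_trans; -- pick up the partition created outside Hive
SHOW PARTITIONS t2a_trans;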


-- Partitioned tables: two-level (nested) partitions


DROP  TABLE db_hive1.t2a_trans_hst PURGE;
-- EXTERNAL
CREATE  TABLE db_hive1.t2a_trans_hst (
    TR_ID STRING COMMENT 'transaction id',
    CUST_ID STRING COMMENT 'customer id',
    TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
    product_code STRING COMMENT 'product code'
    --data_dt DATE COMMENT 'transaction create time'
)
COMMENT 'transaction history table'
PARTITIONED BY (year_dt string,month_dt string) -- partitioned by date parts; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'  -- use the JSON SerDe
ROW FORMAT DELIMITED FIELDS TERMINATED BY "|^|"
STORED AS textfile -- columnar formats such as ORC or Parquet are recommended
--location "/hivedata/db_hive1/t2a_trans_hst"
;


desc extended db_hive1.t2a_trans_hst;

-- spark-thriftserver cannot execute this statement; use Hive JDBC directly
truncate table db_hive1.t2a_trans_hst;



-- SET hive.exec.dynamic.partition=true; -- enable dynamic partitioning
-- SET hive.exec.dynamic.partition.mode=nonstrict; -- allow every partition column to be dynamic


insert into db_hive1.t2a_trans_hst PARTITION (year_dt,month_dt)
select A.tr_id,A.CUST_ID,A.TR_AMT,A.product_code ,year(A.tr_dt) as year_dt, month(A.tr_dt) as month_dt
       from db_hive1.t2a_trans A
        where year(A.tr_dt) is not null and month(A.tr_dt) is not null
        and a.cust_id is not null
;


select year_dt,count(*) from db_hive1.t2a_trans_hst group by year_dt order by count(1) desc limit 100;

select * from db_hive1.t2a_trans_hst T WHERE instr(T.CUST_ID,"吴")>0 AND T.CUST_ID NOT LIKE '吴%';

select * from db_hive1.t2a_trans_hst T where t.CUST_ID is not null;


drop table db_hive1.stu_buck purge ;
-- Bucketed table; used again by the optimizations later on
create table stu_buck(
    id string,
    name string
)clustered by (id)  -- hash-partitioned on id, i.e. a bucketed table
    sorted by (id)
into 4 buckets  -- 4 bucket files
--row format delimited fields terminated by "|@|"
STORED AS ORC
;

desc extended stu_buck ;

select * from db_hive1.stu_buck;

insert into stu_buck
(id,name)
select t.tr_id,t.CUST_ID from db_hive1.t2a_trans t;

select * from db_hive1.stu_buck;

load data local inpath "local file path"
into table db_hive1.stu_buck;
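
-- One immediate payoff of bucketing is efficient sampling; a minimal sketch that reads only the first of
-- the four buckets (hashed on id):
SELECT * FROM db_hive1.stu_buck TABLESAMPLE (BUCKET 1 OUT OF 4 ON id);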


-- Bucketed and sorted table (the name stu_buck is reused here; drop the table above first)
create table stu_buck(
    id string,
    name string
)clustered by (id)  -- hash-partitioned on id, i.e. a bucketed table
    sorted by (id)  -- rows sorted by id within each bucket
into 4 buckets  -- 4 bucket files
row format delimited fields terminated by "^"
;

SELECT * FROM db_hive1.stu_buck;

select t.gender,sum(t.cnt) from test_db.room2_gender_out t  group by t.gender order by sum(t.cnt) desc;


-- Storage formats and compression
-- textfile, orc, parquet, sequencefile
-- ORC is a columnar format

show create table db_hive1.room4;



SELECT * FROM db_hive1.room4;

-- Execution plans
EXPLAIN EXTENDED select t.gender,sum(t.cnt)
                 from test_db.room2_gender_out t
                 group by t.gender order by sum(t.cnt) desc LIMIT 100;



-- JOIN algorithms
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
              FROM test_db.room2_view
              where length(trim(gender)) > 0
              GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
         inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) desc;


-- map join
explain extended
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
              FROM test_db.room2_view
              where length(trim(gender)) > 0
              GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
         inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) asc;



select w.gender,sum(cnt) as sum_cnt from test_db.room2_view w group by w.gender order by sum_cnt desc limit 100;

-- Optimization techniques

-- Common join
-- the default shuffle (reduce-side) join

-- Map join
-- optimizes by broadcasting the small table into memory

-- Bucket map join
-- both sides bucketed on the join key; each mapper loads only the matching bucket, so larger tables can still be map-joined

-- Sort merge bucket (SMB) map join
-- bucket counts in a multiple relationship plus sorted buckets turn the join into a merge join
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
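
-- A minimal sketch of enabling a bucket map join; it assumes two tables bucketed on the join key
-- (stu_buck from above plus a hypothetical stu_buck_big with a compatible bucket count):
set hive.optimize.bucketmapjoin = true;
select /*+ MAPJOIN(a) */ a.id, a.name, b.name
from db_hive1.stu_buck a
         join db_hive1.stu_buck_big b on a.id = b.id; -- stu_buck_big is hypothetical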


-- Group-by (aggregation) data skew
-- Option 1: when group-by keys are distributed unevenly, aggregate on the map side first to relieve the skew
set hive.map.aggr = true; -- enable map-side aggregation
set hive.map.aggr.hash.min.reduction=0.5; -- required ratio of post- to pre-aggregation row counts
set hive.groupby.mapaggr.checkinterval=100000;-- number of rows sampled when checking that ratio

-- Option 2:
-- skew-groupby: enable the skewed group-by optimization
set hive.groupby.skewindata = true;
-- this launches two MR jobs: the first scatters rows to reducers by a random key and partially aggregates; the second does the final group-by aggregation
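
-- What skewindata does automatically can also be written by hand; a minimal sketch of the same two-stage
-- idea on t2a_trans, salting the key with a random suffix before the partial aggregation:
WITH salted AS (
    SELECT t.cust_id, cast(rand() * 10 AS int) AS salt
    FROM db_hive1.t2a_trans t
),
stage1 AS ( -- stage 1: aggregate by (cust_id, salt) so hot keys spread over many reducers
    SELECT cust_id, salt, count(*) AS part_cnt
    FROM salted
    GROUP BY cust_id, salt
)
SELECT cust_id, sum(part_cnt) AS cnt -- stage 2: drop the salt and finish the aggregation
FROM stage1
GROUP BY cust_id;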

-- Join data skew
-- Option 1: map join; resolves skew when a large table joins a small table
-- Option 2: skew join; gives each skewed hot key its own map-join task, for large-to-large joins
-- handles one-to-many skew
-- automatically detects heavily skewed keys and routes them through a map-join-style task
set hive.optimize.skewjoin = true;
set hive.skewjoin.key = 100000; -- keys exceeding 100k rows automatically get a dedicated task
-- Option 3: rewrite the SQL by hand, for a large-to-large join where one side has skewed keys (see the sketch after this list)
 -- on the skewed side, salt the join key with a random suffix in a subquery to scatter it
--  concat(key_col,'_',cast(rand()*n as int)) -- spreads the rows evenly
-- on the other side, whose key is unique and not skewed, expand the table n times:
--    concat(key_col,'_',1)
--  union all concat(key_col,'_',2)
--  union all concat(key_col,'_',n)
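
-- A minimal sketch of option 3 with n = 3 salts; fact_t (skewed), dim_t (unique keys) and their columns
-- are hypothetical names for illustration:
WITH fact_salted AS ( -- skewed side: scatter each key over 3 salted variants
    SELECT f.key_col, f.val,
           concat(f.key_col, '_', cast(cast(rand() * 3 AS int) AS string)) AS salted_key
    FROM fact_t f
),
dim_expanded AS ( -- non-skewed side: replicate each row once per salt value
    SELECT d.key_col, d.attr, concat(d.key_col, '_0') AS salted_key FROM dim_t d
    UNION ALL
    SELECT d.key_col, d.attr, concat(d.key_col, '_1') AS salted_key FROM dim_t d
    UNION ALL
    SELECT d.key_col, d.attr, concat(d.key_col, '_2') AS salted_key FROM dim_t d
)
SELECT f.key_col, f.val, d.attr
FROM fact_salted f
         JOIN dim_expanded d ON f.salted_key = d.salted_key;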

-- Enable CBO (cost-based optimization): joins are ordered by cost, so small and medium tables are joined first before the large table
set hive.cbo.enable = true;
set hive.auto.convert.join = false;

-- Enable predicate pushdown: pushes WHERE filters down toward the table scan
set hive.optimize.ppd = true; -- enabled by default
-- CBO also performs part of the predicate pushdown

-- Vectorized execution
set hive.vectorized.execution.enabled=true;

-- Local mode
set hive.exec.mode.local.auto = true;

-- Parallel execution: stages (MR jobs) with no dependency on each other run in parallel
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number = 8;

-- Strict mode: guards against dangerous queries; set these checks to true globally
set hive.strict.checks.no.partition.filter=true; -- queries on partitioned tables must filter on a partition column

-- Reject ORDER BY without LIMIT, to rein in global sorts
set hive.strict.checks.orderby.no.limit=true;

-- Reject Cartesian products
set hive.strict.checks.cartesian.product=true;



explain FORMATTED
WITH TEMP AS (SELECT gender, sum(cnt) AS CNT
              FROM test_db.room2_view
              where length(trim(gender)) > 0
              GROUP BY gender)
select /*+ MAPJOIN(p) */ p.gender, count(distinct b.sname) as sname_cnt
from TEMP p
         inner join db_hive1.room4 b on p.gender = b.gender
group by p.gender
order by count(distinct b.sname) asc limit 100;

  
