Hive partitioned tables: secondary partitions, partitioned and bucketed tables, dynamic partitions
-- Partitioned tables
-- Partitioned table with dynamic partitioning
drop table db_hive1.t2a_trans purge;
CREATE EXTERNAL TABLE db_hive1.t2a_trans (
TR_ID STRING COMMENT 'transaction serial number',
CUST_ID STRING COMMENT 'customer ID',
TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
product_code STRING COMMENT 'product code'
--data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction fact table'
PARTITIONED BY (tr_dt date) -- partition by date; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' -- use the JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC; -- columnar formats such as ORC or Parquet are recommended
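-- A minimal static-partition insert sketch (the values are illustrative): the static
-- PARTITION clause pins the target partition, so no dynamic-partition settings are needed.
INSERT INTO db_hive1.t2a_trans PARTITION (tr_dt = DATE '2022-01-01')
VALUES ('1', 'cust001', 1.00, 'P001');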
truncate table db_hive1.t2a_trans; -- note: TRUNCATE normally requires a managed table (or an external table with external.table.purge=true)
INSERT INTO db_hive1.t2a_trans
(TR_ID, CUST_ID, TR_AMT, product_code, tr_dt)
SELECT cast(cast(r.id as bigint) as string) as tr_id,
r.sname as cust_id,
round(rand(),2) as tr_amt,
r.ctfid as product_code,
to_date(substr(r.version,1,4) || '-01-01') as tr_dt
FROM db_hive1.room4 r
where r.version is not null
--and r.sname like '%鬼%'
;
select tr_dt,count(*) from db_hive1.t2a_trans t where t.tr_dt>=date'2010-01-01' group by tr_dt order by t.tr_dt limit 100;
SELECT * FROM db_hive1.t2a_trans;
select * from test_db.room2_view;
-- window functions: default clauses may simply be omitted
show create table db_hive1.t2a_trans;
SHOW PARTITIONS t2a_trans;
-- add a partition
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01");
-- add several partitions in one statement (IF NOT EXISTS avoids an error for the partition added above)
ALTER TABLE t2a_trans ADD IF NOT EXISTS PARTITION (tr_dt = DATE "2022-11-01") PARTITION (tr_dt = DATE "2022-11-02");
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
-- drop several partitions in one statement (IF EXISTS avoids an error for the partition dropped above)
alter table t2a_trans drop IF EXISTS PARTITION (tr_dt = DATE "2022-11-01"),PARTITION (tr_dt = DATE "2022-11-02");
TRUNCATE TABLE t2a_trans PARTITION (tr_dt = DATE "2022-11-01");
SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";
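-- Hedged sanity check: the EXPLAIN plan should show the scan limited to the
-- tr_dt=2012-01-01 partition directory, confirming partition pruning.
EXPLAIN SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";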
-- repairing partitions
alter table t2a_trans add partition (tr_dt = DATE "2022-11-01");
-- if the data is deleted manually from the HDFS path, the metadata does not change; the corresponding partition should then be dropped to keep the two consistent
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
-- general way to repair metadata: MSCK
msck repair table t2a_trans add partitions; -- add metadata for partitions whose HDFS path exists but whose metadata is missing
msck repair table t2a_trans drop partitions; -- drop metadata for partitions whose HDFS path no longer exists
msck repair table t2a_trans sync partitions; -- keep HDFS paths and metadata in sync (both add and drop)
msck repair table t2a_trans; -- with no option the default is ADD PARTITIONS, not a full sync
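-- Typical MSCK scenario sketch (the path is illustrative, assuming the /hivedata
-- warehouse dir from the hive-site.xml below): a partition directory created directly
-- on HDFS stays invisible to queries until the metadata is repaired.
-- hdfs dfs -mkdir -p /hivedata/db_hive1.db/t2a_trans/tr_dt=2022-11-03
msck repair table t2a_trans; -- registers tr_dt=2022-11-03 in the metastore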
-- Partitioned tables: two-level (secondary) partitions
DROP TABLE db_hive1.t2a_trans_hst PURGE;
-- EXTERNAL
CREATE TABLE db_hive1.t2a_trans_hst (
TR_ID STRING COMMENT 'transaction serial number',
CUST_ID STRING COMMENT 'customer ID',
TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
product_code STRING COMMENT 'product code'
--data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction history fact table'
PARTITIONED BY (year_dt string,month_dt string) -- two-level partition by year and month; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' -- use the JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC -- columnar formats such as ORC or Parquet are recommended
--location "/hivedata/db_hive1/t2a_trans_hst"
;
desc extended db_hive1.t2a_trans_hst;
-- spark-thriftserver cannot execute this; use the Hive JDBC connection directly
truncate table db_hive1.t2a_trans_hst;
-- SET hive.exec.dynamic.partition=true; -- enable dynamic partitioning
-- SET hive.exec.dynamic.partition.mode=nonstrict;
insert into db_hive1.t2a_trans_hst PARTITION (year_dt,month_dt)
select A.tr_id,A.CUST_ID,A.TR_AMT,A.product_code ,year(A.tr_dt) as year_dt, month(A.tr_dt) as month_dt
from db_hive1.t2a_trans A
where year(A.tr_dt) is not null and month(A.tr_dt) is not null
and a.cust_id is not null
;
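-- Pruning sketch for the two-level layout (the values are illustrative): filtering on
-- both partition columns restricts the scan to a single year_dt=.../month_dt=... directory.
select count(*) from db_hive1.t2a_trans_hst t where t.year_dt = '2012' and t.month_dt = '1';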
select year_dt,count(*) from db_hive1.t2a_trans_hst group by year_dt order by count(1) desc limit 100;
select * from db_hive1.t2a_trans_hst T WHERE instr(T.CUST_ID,"吴")>0 AND T.CUST_ID NOT LIKE '吴%';
select * from db_hive1.t2a_trans_hst T where t.CUST_ID is not null;
drop table db_hive1.stu_buck purge ;
-- Bucketed table; used later for optimizations
create table stu_buck(
id string,
name string
)clustered by (id) -- rows are distributed by the hash of id, i.e., a bucketed table
sorted by (id) -- rows within each bucket are sorted by id
into 4 buckets -- 4 bucket files
--row format delimited fields terminated by "|@|"
STORED AS ORC
;
desc extended stu_buck ;
select * from db_hive1.stu_buck;
insert into stu_buck
(id,name)
select t.tr_id,t.CUST_ID from db_hive1.t2a_trans t;
select * from db_hive1.stu_buck;
load data local inpath "<local file path>"
into table db_hive1.stu_buck;
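-- One payoff of bucketing: TABLESAMPLE can read just some of the buckets instead of
-- scanning the whole table; this sketch returns roughly a quarter of the rows.
select * from db_hive1.stu_buck tablesample (bucket 1 out of 4 on id);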
-- Bucketed and sorted table (drop the stu_buck created above first, since the name is reused)
create table stu_buck(
id string,
name string
)clustered by (id) -- rows are distributed by the hash of id, i.e., a bucketed table
sorted by (id) -- rows within each bucket are sorted by id
into 4 buckets -- 4 bucket files
row format delimited fields terminated by "^"
;
SELECT * FROM db_hive1.stu_buck;
select t.gender,sum(t.cnt) from test_db.room2_gender_out t group by t.gender order by sum(t.cnt) desc;
hive-site.xml configuration: dynamic partitioning and nonstrict mode
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive_metadata?allowPublicKeyRetrieval=true&amp;createDatabaseIfNotExist=true&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value> <!-- MySQL 8 driver class -->
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/hivedata</value> <!-- HDFS warehouse path -->
</property>
<!-- HiveServer2配置 -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>hadoop01</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://hadoop01:9083</value>
</property>
<!-- enable dynamic partitioning -->
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
<!-- set the dynamic partition mode to nonstrict -->
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
<description>In nonstrict mode all partitions are allowed to be dynamic.</description>
</property>
<property>
<name>hive.error.on.empty.partition</name>
<value>false</value>
<description>do not raise an error when a dynamic-partition insert query returns no rows</description>
</property>
<!-- Hive execution engine (mr here; tez or spark are the alternatives) -->
<property>
<name>hive.execution.engine</name>
<value>mr</value>
</property>
<property>
<name>spark.home</name>
<value>/opt/spark-2.4.5</value>
</property>
<!--
<property>
<name>hive.enable.spark.execution.engine</name>
<value>true</value>
</property>
<property>
<name>spark.home</name>
<value>/opt/spark-3.3.2</value>
</property>
<property>
<name>spark.master</name>
<value>yarn</value>
</property>
-->
<!--
<property>
<name>spark.master</name>
<value>yarn</value>
</property>
-->
<property>
<name>spark.yarn.jars</name>
<value>hdfs://hadoop01:9000/spark-jars/*</value>
</property>
<!--
<property>
<name>hive.exec.parallel</name>
<value>true</value>
<description>whether to run independent job stages in parallel</description>
</property>
<property>
<name>hive.exec.parallel.thread.number</name>
<value>10</value>
<description>maximum number of threads for parallel execution</description>
</property>
<property>
<name>hive.exec.reducers.bytes.per.reducer</name>
<value>256000000</value>
<description>bytes of input processed per reducer</description>
</property>
<property>
<name>hive.exec.reducers.max</name>
<value>1000</value>
<description>maximum number of reducers</description>
</property>
<property>
<name>hive.exec.memory</name>
<value>2048</value>
<description>memory per map/reduce task (MB)</description>
</property>
<property>
<name>hive.auto.convert.join</name>
<value>true</value>
<description>whether to automatically convert joins with small tables to map-side joins</description>
</property>
<property>
<name>hive.optimize.sort.dynamic.partition</name>
<value>true</value>
<description>whether to sort rows by the dynamic partition columns before writing</description>
</property>
-->
</configuration>
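The dynamic-partition settings above can also be applied per session instead of in hive-site.xml, mirroring the commented SET statements earlier in this script:
SET hive.exec.dynamic.partition=true;
SET hive.exec.dynamic.partition.mode=nonstrict;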