Hive partitioned tables: two-level partitions, bucketed tables, dynamic partitions

-- Partitioned tables

-- Partitioned table, loaded via dynamic partitioning
drop table db_hive1.t2a_trans purge;

CREATE EXTERNAL TABLE t2a_trans (
    TR_ID STRING COMMENT 'transaction ID',
    CUST_ID STRING COMMENT 'customer ID',
    TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
    product_code STRING COMMENT 'product code'
    --data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction fact table'
PARTITIONED BY (tr_dt date) -- partitioned by date; partition columns are usually string or date types
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'  -- use the JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC; -- columnar formats such as ORC or Parquet are recommended

-- Note: TRUNCATE on an external table may fail unless TBLPROPERTIES ('external.table.purge'='true') is set
truncate table db_hive1.t2a_trans;
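
-- Before the dynamic insert below: for a single known day, a static partition
-- spec is enough. A minimal sketch, assuming the same room4 source table:
INSERT INTO db_hive1.t2a_trans PARTITION (tr_dt = DATE '2022-11-01')
SELECT cast(cast(r.id as bigint) as string),
       r.sname,
       round(rand(), 2),
       r.ctfid
  FROM db_hive1.room4 r
 WHERE r.version is not null;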


INSERT INTO db_hive1.t2a_trans
(TR_ID, CUST_ID, TR_AMT, product_code, tr_dt)
SELECT cast(cast(r.id as bigint) as string) as tr_id,
       r.sname as cust_id,
       round(rand(),2) as tr_amt,
       r.ctfid as product_code,
       -- first day of the version year (cast/concat is portable; two-argument to_date is Spark SQL only)
       cast(concat(substr(r.version,1,4),'-01-01') as date) as tr_dt
       FROM db_hive1.room4 r
    where r.version is not null
    --and r.sname like '%鬼%'
;
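
-- The classic Hive form of the dynamic insert above names the partition column
-- in a PARTITION clause rather than in the column list; a sketch loading the same rows:
INSERT INTO TABLE db_hive1.t2a_trans PARTITION (tr_dt)
SELECT cast(cast(r.id as bigint) as string),
       r.sname,
       round(rand(), 2),
       r.ctfid,
       cast(concat(substr(r.version,1,4),'-01-01') as date) as tr_dt
  FROM db_hive1.room4 r
 WHERE r.version is not null;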

select tr_dt,count(*) from db_hive1.t2a_trans t where t.tr_dt>=date'2010-01-01' group by tr_dt order by t.tr_dt limit 100;

SELECT * FROM db_hive1.t2a_trans;

select * from test_db.room2_view;

-- (side note) window functions: default clauses can be left out

show create table db_hive1.t2a_trans;

SHOW PARTITIONS t2a_trans;

-- Add a partition (several specs can be combined in one statement)
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01");
ALTER TABLE t2a_trans ADD PARTITION (tr_dt = DATE "2022-11-01") PARTITION (tr_dt = DATE "2022-11-02");

alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01"),PARTITION (tr_dt = DATE "2022-11-02");


TRUNCATE TABLE t2a_trans PARTITION (tr_dt = DATE "2022-11-01");

SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";
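
-- To confirm the filter prunes to a single partition, EXPLAIN DEPENDENCY lists
-- exactly the tables and partitions the query would scan:
EXPLAIN DEPENDENCY SELECT * FROM t2a_trans where tr_dt = DATE "2012-01-01";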

-- Repair a partition: re-register one whose HDFS path still exists but whose metadata was dropped
alter table t2a_trans add partition (tr_dt = DATE "2022-11-01");

-- If partition data is deleted from the HDFS path by hand, the metastore does not change; drop the corresponding partition to keep the two consistent
alter table t2a_trans drop PARTITION (tr_dt = DATE "2022-11-01");

-- General-purpose metadata repair: msck
msck repair table t2a_trans add partitions ; -- add metadata for partitions whose HDFS paths exist but are missing from the metastore
msck repair table t2a_trans drop partitions ; -- drop metadata for partitions whose HDFS paths no longer exist
msck repair table t2a_trans sync partitions ; -- both of the above: keep HDFS paths and metadata in sync

msck repair table t2a_trans ; -- with no clause, defaults to ADD PARTITIONS
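
-- A sketch of the drift that msck repairs: create a partition directory directly
-- on HDFS (hypothetical path under the /hivedata warehouse), then re-sync:
--   hdfs dfs -mkdir -p /hivedata/db_hive1.db/t2a_trans/tr_dt=2022-12-01
msck repair table t2a_trans ; -- registers tr_dt=2022-12-01 in the metastore
SHOW PARTITIONS t2a_trans;    -- the new partition now appears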


-- Partitioned tables: two-level partitions


DROP  TABLE db_hive1.t2a_trans_hst PURGE;
-- EXTERNAL
CREATE  TABLE db_hive1.t2a_trans_hst (
    TR_ID STRING COMMENT 'transaction ID',
    CUST_ID STRING COMMENT 'customer ID',
    TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
    product_code STRING COMMENT 'product code'
    --data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction history table'
PARTITIONED BY (year_dt string, month_dt string) -- two-level partitioning by year then month; partition columns are usually string or date types
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'  -- use the JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC -- columnar formats such as ORC or Parquet are recommended
--location "/hivedata/db_hive1/t2a_trans_hst"
;


desc extended db_hive1.t2a_trans_hst;

-- spark-thriftserver cannot execute this statement; use Hive JDBC directly
truncate table db_hive1.t2a_trans_hst;



-- SET hive.exec.dynamic.partition=true; -- enable dynamic partitioning
-- SET hive.exec.dynamic.partition.mode=nonstrict;


insert into db_hive1.t2a_trans_hst PARTITION (year_dt, month_dt)
select A.tr_id, A.CUST_ID, A.TR_AMT, A.product_code, year(A.tr_dt) as year_dt, month(A.tr_dt) as month_dt
  from db_hive1.t2a_trans A
 where year(A.tr_dt) is not null and month(A.tr_dt) is not null
   and A.cust_id is not null
;
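
-- Strict mode (hive.exec.dynamic.partition.mode=strict) requires at least one
-- static partition column; mixing static and dynamic specs satisfies it.
-- A sketch: year fixed, month resolved per row:
insert into db_hive1.t2a_trans_hst PARTITION (year_dt = '2012', month_dt)
select A.tr_id, A.CUST_ID, A.TR_AMT, A.product_code, month(A.tr_dt) as month_dt
  from db_hive1.t2a_trans A
 where year(A.tr_dt) = 2012;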


select year_dt,count(*) from db_hive1.t2a_trans_hst group by year_dt order by count(1) desc limit 100;

select * from db_hive1.t2a_trans_hst T WHERE instr(T.CUST_ID,"吴")>0 AND T.CUST_ID NOT LIKE '吴%';

select * from db_hive1.t2a_trans_hst T where t.CUST_ID is not null;


drop table db_hive1.stu_buck purge ;
-- Bucketed table; used later by several optimizations
create table stu_buck(
    id string,
    name string
)clustered by (id)  -- rows distributed into buckets by hash of id, i.e. a bucketed table
    sorted by (id)  -- and sorted by id within each bucket
into 4 buckets  -- 4 bucket files
--row format delimited fields terminated by "|@|"
STORED AS ORC
;
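
-- Bucketing makes sampling cheap: TABLESAMPLE can read a single bucket file
-- instead of scanning the whole table:
SELECT * FROM stu_buck TABLESAMPLE (BUCKET 1 OUT OF 4 ON id);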

desc extended stu_buck ;

select * from db_hive1.stu_buck;
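
-- Pre-2.0 Hive only: the insert below respects the bucket layout only when
-- bucketing is enforced for the session (Hive 2.0+ always enforces it and has removed this flag):
SET hive.enforce.bucketing = true;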

insert into stu_buck
(id,name)
select t.tr_id,t.CUST_ID from db_hive1.t2a_trans t;

select * from db_hive1.stu_buck;

-- Note: Hive rejects LOAD DATA into bucketed tables unless hive.strict.checks.bucketing
-- is disabled; Hive 3.0+ instead rewrites the load as an INSERT that honors the buckets.
load data local inpath "<local file path>"
into table db_hive1.stu_buck;


-- Bucketed and sorted table (text-format variant; drop stu_buck first, since the name is reused)
create table stu_buck(
    id string,
    name string
)clustered by (id)  -- rows distributed into buckets by hash of id
    sorted by (id)  -- and sorted by id within each bucket
into 4 buckets  -- 4 bucket files
row format delimited fields terminated by "^"
;
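
-- To verify the physical layout, list the table directory from the Hive shell;
-- after an insert there should be 4 bucket files (000000_0 .. 000003_0).
-- The path is a hypothetical location under the /hivedata warehouse:
dfs -ls /hivedata/db_hive1.db/stu_buck;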

SELECT * FROM db_hive1.stu_buck;

select t.gender,sum(t.cnt) from test_db.room2_gender_out t  group by t.gender order by sum(t.cnt) desc;

  

hive-site.xml configuration: dynamic partitioning and nonstrict mode

<configuration>
  <property>
    <name>javax.jdo.option.ConnectionURL</name>
    <value>jdbc:mysql://localhost:3306/hive_metadata?allowPublicKeyRetrieval=true&amp;createDatabaseIfNotExist=true&amp;useSSL=false</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionDriverName</name>
    <value>com.mysql.cj.jdbc.Driver</value>  <!-- MySQL 8 driver class -->
  </property>
  <property>
    <name>javax.jdo.option.ConnectionUserName</name>
    <value>root</value>
  </property>
  <property>
    <name>javax.jdo.option.ConnectionPassword</name>
    <value>123456</value>
  </property>

    <property>
    <name>hive.metastore.warehouse.dir</name>
    <value>/hivedata</value> <!-- HDFS warehouse path -->
  </property> 

   <!-- HiveServer2配置 -->
    <property>
        <name>hive.server2.thrift.port</name>
        <value>10000</value>
    </property>
    <property>
        <name>hive.server2.thrift.bind.host</name>
        <value>hadoop01</value>
    </property>
    <property>
        <name>hive.metastore.uris</name>
        <value>thrift://hadoop01:9083</value>
    </property>

<!-- 开启动态分区功能 -->
  <property>
    <name>hive.exec.dynamic.partition</name>
    <value>true</value>
    <description>Whether or not to allow dynamic partitions in DML/DDL.</description>
  </property>
  <!-- 设置动态分区模式为非严格模式 -->
  <property>
    <name>hive.exec.dynamic.partition.mode</name>
    <value>nonstrict</value>
    <description>In nonstrict mode all partitions are allowed to be dynamic.</description>
  </property>

<property>
    <name>hive.error.on.empty.partition</name>
    <value>false</value>
    <description>Do not throw an exception when a dynamic partition insert produces no rows</description>
  </property>



<!-- Hive execution engine (mr here; change to tez or spark if those engines are installed) -->
  <property>
    <name>hive.execution.engine</name>
    <value>mr</value>
  </property>

<property>
  <name>spark.home</name>
  <value>/opt/spark-2.4.5</value>
</property>

 
<!--
 <property>
    <name>hive.enable.spark.execution.engine</name>
    <value>true</value>
  </property>

 <property>
    <name>spark.home</name>
    <value>/opt/spark-3.3.2</value>
  </property>
<property>
    <name>spark.master</name>
    <value>yarn</value>
  </property>
  -->
<!--
<property>
  <name>spark.master</name>
  <value>yarn</value>
</property>
-->
<property>
  <name>spark.yarn.jars</name>
  <value>hdfs://hadoop01:9000/spark-jars/*</value>
</property>


<!--
 <property>
   <name>hive.exec.parallel</name>
   <value>true</value>
   <description>Whether to run independent job stages in parallel</description>
 </property>
 <property>
   <name>hive.exec.parallel.thread.number</name>
   <value>10</value>
   <description>Maximum number of stages executed in parallel</description>
 </property>
 <property>
   <name>hive.exec.reducers.bytes.per.reducer</name>
   <value>256000000</value>
   <description>Bytes of input data handled per reducer</description>
 </property>
 <property>
   <name>hive.exec.reducers.max</name>
   <value>1000</value>
   <description>Maximum number of reducers</description>
 </property>
 <property>
   <name>hive.exec.memory</name>
   <value>2048</value>
   <description>Memory per map/reduce task (MB)</description>
 </property>
 <property>
   <name>hive.auto.convert.join</name>
   <value>true</value>
   <description>Automatically convert joins against small tables to map-side joins</description>
 </property>
 <property>
   <name>hive.optimize.sort.dynamic.partition</name>
   <value>true</value>
   <description>Sort rows by dynamic partition columns before writing</description>
 </property>
-->


</configuration>

  

 
