Hive partitioned tables: configuring dynamic partitions
-- Partitioned table with dynamic partitioning
drop table if exists db_hive1.t2a_trans purge;
CREATE TABLE db_hive1.t2a_trans (
TR_ID STRING COMMENT 'transaction serial number',
CUST_ID STRING COMMENT 'customer ID',
TR_AMT DECIMAL(10,2) COMMENT 'transaction amount',
product_code STRING COMMENT 'product code'
--data_dt DATE COMMENT 'transaction creation time'
)
COMMENT 'transaction fact table'
PARTITIONED BY (tr_dt date) -- partitioned by date; partition columns are usually string or date typed
--ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' -- JSON SerDe
--ROW FORMAT DELIMITED FIELDS TERMINATED BY "|@|"
STORED AS ORC; -- columnar formats such as ORC or Parquet are recommended
truncate table db_hive1.t2a_trans;
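The dynamic-partition insert below requires two switches to be on. They can be set globally in hive-site.xml (shown later in this post) or per session; a minimal sketch using the same property names:
-- Session-level alternative to the hive-site.xml settings
SET hive.exec.dynamic.partition=true;        -- allow dynamic partitions in DML
SET hive.exec.dynamic.partition.mode=nonstrict; -- no static partition column required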
INSERT INTO TABLE db_hive1.t2a_trans PARTITION (tr_dt) -- tr_dt is filled dynamically from the last SELECT column
SELECT cast(cast(r.id as bigint) as string) as tr_id,
r.sname as cust_id,
round(rand(),2) as tr_amt,
r.ctfid as product_code,
to_date(concat(substr(r.version,1,4),'-01-01')) as tr_dt -- Hive's to_date takes a single argument, no format string
FROM db_hive1.room4 r
where r.version is not null
--and r.sname like '%鬼%'
;
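For contrast, a static-partition insert names the partition value as a literal and needs no dynamic-partition settings; a hypothetical sketch against the same source table (the '2020-01-01' value is illustrative):
-- Static partition: the partition value is fixed, so the SELECT list omits tr_dt
INSERT INTO TABLE db_hive1.t2a_trans PARTITION (tr_dt='2020-01-01')
SELECT cast(cast(r.id as bigint) as string),
r.sname,
round(rand(),2),
r.ctfid
FROM db_hive1.room4 r
where r.version is not null;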
select tr_dt,count(*) from db_hive1.t2a_trans group by tr_dt;
SELECT * FROM db_hive1.t2a_trans;
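To confirm which partitions the insert actually created, list them directly; each partition maps to an HDFS subdirectory under the table's warehouse path:
SHOW PARTITIONS db_hive1.t2a_trans;
-- e.g. a row like tr_dt=2018-01-01 corresponds to .../t2a_trans/tr_dt=2018-01-01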
Enable dynamic partitioning by configuring Hive's conf/hive-site.xml:
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://localhost:3306/hive_metadata?allowPublicKeyRetrieval=true&amp;createDatabaseIfNotExist=true&amp;useSSL=false</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.cj.jdbc.Driver</value> <!-- MySQL 8 driver class -->
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value>
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
<property>
<name>hive.metastore.warehouse.dir</name>
<value>/hivedata</value> <!-- warehouse path on HDFS -->
</property>
<!-- HiveServer2 settings -->
<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>
<property>
<name>hive.server2.thrift.bind.host</name>
<value>hadoop01</value>
</property>
<property>
<name>hive.metastore.uris</name>
<value>thrift://hadoop01:9083</value>
</property>
<!-- Enable dynamic partitioning -->
<property>
<name>hive.exec.dynamic.partition</name>
<value>true</value>
<description>Whether or not to allow dynamic partitions in DML/DDL.</description>
</property>
<!-- Set dynamic partition mode to nonstrict -->
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
<description>In nonstrict mode all partitions are allowed to be dynamic.</description>
</property>
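<!-- Optional guards against runaway partition creation. These are standard Hive
properties; the values shown are Hive's defaults, adjust them to your workload. -->
<property>
<name>hive.exec.max.dynamic.partitions</name>
<value>1000</value>
<description>Maximum number of dynamic partitions allowed per statement.</description>
</property>
<property>
<name>hive.exec.max.dynamic.partitions.pernode</name>
<value>100</value>
<description>Maximum number of dynamic partitions created per mapper/reducer node.</description>
</property>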
<!-- Execution engine: mr (MapReduce); tez or spark could be used instead -->
<property>
<name>hive.execution.engine</name>
<value>mr</value>
</property>
<property>
<name>spark.home</name>
<value>/opt/spark-2.4.5</value>
</property>
<!--
<property>
<name>hive.enable.spark.execution.engine</name>
<value>true</value>
</property>
<property>
<name>spark.home</name>
<value>/opt/spark-3.3.2</value>
</property>
<property>
<name>spark.master</name>
<value>yarn</value>
</property>
-->
<!--
<property>
<name>spark.master</name>
<value>yarn</value>
</property>
-->
<property>
<name>spark.yarn.jars</name>
<value>hdfs://hadoop01:9000/spark-jars/*</value>
</property>
<!--
<property>
<name>hive.exec.parallel</name>
<value>true</value>
<description>Whether to run independent query stages in parallel</description>
</property>
<property>
<name>hive.exec.parallel.thread.number</name>
<value>10</value>
<description>Maximum number of jobs to run in parallel</description>
</property>
<property>
<name>hive.exec.reducers.bytes.per.reducer</name>
<value>256000000</value>
<description>Bytes of input data per reducer</description>
</property>
<property>
<name>hive.exec.reducers.max</name>
<value>1000</value>
<description>Maximum number of reducers</description>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>2048</value>
<description>Memory per map task (MB)</description>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>2048</value>
<description>Memory per reduce task (MB)</description>
</property>
<property>
<name>hive.auto.convert.join</name>
<value>true</value>
<description>Automatically convert joins with small tables to map-side joins</description>
</property>
<property>
<name>hive.optimize.sort.dynamic.partition</name>
<value>true</value>
<description>Globally sort on the dynamic partition column so each reducer keeps only one writer open per partition</description>
</property>
-->
</configuration>
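After restarting the metastore and HiveServer2, connect with beeline and confirm the switches took effect (host, port, and user below follow the values configured above; adjust to your environment):
beeline -u jdbc:hive2://hadoop01:10000 -n root
-- inside the beeline session, SET with no value prints the current setting:
SET hive.exec.dynamic.partition;
SET hive.exec.dynamic.partition.mode;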