Common Iceberg Commands
I. Launch the spark-sql client
spark-sql --master yarn \
--deploy-mode client \
--queue default \
--name wang \
--driver-memory 12G \
--num-executors 10 \
--executor-cores 2 \
--executor-memory 10G
II. Query table information with SQL
1. Query table snapshot information
SELECT * FROM spark_catalog.data_lake_ods.test_table.snapshots order by committed_at desc
SELECT count(1) FROM spark_catalog.data_lake_ods.test_table.snapshots
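If a specific snapshot id is needed later (for VERSION AS OF in section V, or a rollback), it can be pulled from the same metadata table; a minimal sketch against the example table above:
-- Most recent snapshot of the example table
SELECT snapshot_id, committed_at, operation
FROM spark_catalog.data_lake_ods.test_table.snapshots
ORDER BY committed_at DESC
LIMIT 1;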
2. Query table data files
--Data files referenced by the current snapshot
SELECT count(1) FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.data_files
--All data files across all snapshots
SELECT count(1) FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.all_data_files
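To spot small-file problems before running the compaction procedures in section IV, the data_files metadata table can be aggregated per partition; a sketch on the same example table (for an unpartitioned table the partition column is simply an empty struct):
-- Per-partition file count and average file size, to locate small-file hot spots
SELECT partition, count(1) AS file_cnt, avg(file_size_in_bytes) AS avg_file_bytes
FROM data_lake_ods_test.dwd_mkt_uqun_call_detail.data_files
GROUP BY partition
ORDER BY file_cnt DESC;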
3. Query delete files
--Delete files referenced by the current snapshot
SELECT count(1) FROM data_lake_ods.test_table.delete_files
--All delete files across all snapshots
SELECT * FROM data_lake_ods.test_table.all_delete_files
SELECT count(1) FROM data_lake_ods.test_table.all_delete_files
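A per-partition breakdown helps decide whether the delete-file cleanup in section IV item 3 is warranted; a minimal sketch:
-- Delete-file count and size per partition under the current snapshot
SELECT partition, count(1) AS delete_file_cnt, sum(file_size_in_bytes) AS total_bytes
FROM data_lake_ods.test_table.delete_files
GROUP BY partition
ORDER BY delete_file_cnt DESC;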
4. Query table snapshot history
select * from data_lake_ods.ods_bdg_dp_data_source.history
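The history metadata table also records whether each snapshot is still an ancestor of the current table state; filtering on its is_current_ancestor column separates the live lineage from snapshots abandoned by rollbacks:
-- Snapshots that are still on the current table lineage
select * from data_lake_ods.ods_bdg_dp_data_source.history where is_current_ancestor = true;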
5. Query manifests
select * from iceberg_catalog.data_lake_ods.ods_bdg_dp_data_source.manifests
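Besides manifests, the partitions metadata table gives a per-partition summary of files and records, which is useful input for the governance checks in section IV; a minimal sketch on the same table:
-- File and record counts per partition
select * from iceberg_catalog.data_lake_ods.ods_bdg_dp_data_source.partitions;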
III. Iceberg table DDL operations
1. Create a non-partitioned table
--Create the table. Bloom filter columns should match the primary key columns; no more than 3 are recommended.
CREATE TABLE data_lake_ods.test123(
  changed_type int comment 'null',
  id bigint comment 'id',
  create_time string comment 'creation time',
  update_time string comment 'update time'
)USING iceberg
TBLPROPERTIES(
  'format-version'='2'
  ,'write.upsert.enabled'='true'
  ,'engine.hive.enabled'='true'
  ,'write.distribution-mode' = 'hash'
  ,'write.metadata.metrics.default'='full'
  ,'write.parquet.bloom-filter-enabled.column.id' = 'true'
  ,'table.drop.base-path.enabled'='true'
);
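With 'format-version'='2' and 'write.upsert.enabled'='true', row-level upserts into this table are typically expressed with MERGE INTO; a minimal sketch (updates_src is a hypothetical staging view with the same schema, not part of the original notes):
MERGE INTO data_lake_ods.test123 AS t
USING updates_src AS s   -- hypothetical source view/table
ON t.id = s.id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *;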
2. Create a bucketed table
--Bucketed table. Bloom filter columns should match the primary key columns; no more than 3 are recommended.
CREATE TABLE spark_catalog.data_lake_ods_test.test_bucket_month (
  `order_number` BIGINT COMMENT 'order number',
  `user_number` BIGINT COMMENT 'user number',
  `pay_number` BIGINT COMMENT 'payment number',
  `spu_number` BIGINT COMMENT 'product SPU number',
  `sku_number` BIGINT COMMENT 'product SKU number',
  `kafka_timestamp` TIMESTAMP COMMENT 'Kafka write timestamp'
)USING iceberg
PARTITIONED BY (bucket(12,order_number))
TBLPROPERTIES(
  'format-version'='2'
  ,'write.upsert.enabled'='true'
  ,'engine.hive.enabled'='true'
  ,'write.distribution-mode' = 'hash'
  ,'write.metadata.metrics.default'='full'
  ,'write.parquet.bloom-filter-enabled.column.id' = 'true'
  ,'table.drop.base-path.enabled'='true'
);
Other optimization parameters (these can be set at CREATE TABLE time, or applied to an existing table with ALTER TABLE ... SET TBLPROPERTIES as in item 3 below)
TBLPROPERTIES (
  'write.merge.mode' = 'merge-on-read',
  'write.update.mode' = 'merge-on-read',
  'write.delete.mode' ='merge-on-read',
  'write.metadata.delete-after-commit.enabled' = 'true',
  -- Number of metadata.json files to keep. Older metadata.json files are deleted,
  -- but snapshots are not cleaned up (manifest lists, manifests, data files and delete files all remain).
  'write.metadata.previous-versions-max' = '10'
);
3. Modify table properties
alter table data_lake_ods.test123 SET TBLPROPERTIES('write.metadata.metrics.default'='full','write.distribution-mode' = 'hash');
4. Drop a table
drop table data_lake_ods.test123 purge;
IV. Table governance
1. Small-file compaction (non-partitioned table)
CALL spark_catalog.system.rewrite_data_files(
table => 'data_lake_ods.test123',
options => map(
  'partial-progress.enabled', 'true',
  'rewrite-job-order', 'bytes-asc',
  'partial-progress.max-commits', '10000',
  'max-file-group-size-bytes', '1073741824',
  'target-file-size-bytes','134217728',
  'rewrite-all','true'
  )
);
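rewrite_data_files also accepts a sort strategy, which improves data clustering while compacting (binpack is the default); a sketch, assuming sorting by the id column of the example table:
CALL spark_catalog.system.rewrite_data_files(
  table => 'data_lake_ods.test123',
  strategy => 'sort',
  sort_order => 'id',
  options => map('rewrite-all', 'true')
);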
2. Small-file compaction (partitioned table)
CALL spark_catalog.system.rewrite_data_files(
table => 'data_lake_ods_test.test123',
options => map(
  'partial-progress.enabled', 'true',
  'rewrite-job-order', 'bytes-asc',
  'partial-progress.max-commits', '10000',
  'max-file-group-size-bytes', '1073741824',
  'target-file-size-bytes','134217728',
  'remove-dangling-deletes','true',
  'rewrite-all','true'
  )
);
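After heavy compaction the manifest files themselves can become numerous or skewed; the rewrite_manifests procedure compacts metadata in the same spirit; a minimal sketch:
CALL spark_catalog.system.rewrite_manifests(table => 'data_lake_ods_test.test123');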
3. Compact a table with a very large number of delete files (first get rid of the delete files, then rerun the compaction commands above)
CALL spark_catalog.system.rewrite_data_files(
table => 'data_lake_ods.test123',
options => map(
  'max-concurrent-file-group-rewrites','19',
  'max-file-group-size-bytes','1',
  'partial-progress.enabled', 'true',
  'rewrite-all','true'
  )
);
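On Iceberg 1.3 or later, position delete files can also be compacted directly with rewrite_position_delete_files before rerunning the data-file compaction above; a sketch, assuming that procedure is available in your build (option names mirror rewrite_data_files):
CALL spark_catalog.system.rewrite_position_delete_files(
  table => 'data_lake_ods.test123',
  options => map('rewrite-all', 'true')
);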
4. Expire old snapshots
CALL spark_catalog.system.expire_snapshots(table => 'data_lake_ods.test_table', older_than => TIMESTAMP '2024-04-08 10:00:00.000');
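expire_snapshots only removes snapshot metadata and the files those snapshots exclusively reference; files orphaned by failed writes need remove_orphan_files, and retain_last guards against expiring too much. A sketch with illustrative values:
-- Keep at least the 5 most recent snapshots even if they are older than the cutoff
CALL spark_catalog.system.expire_snapshots(table => 'data_lake_ods.test_table', older_than => TIMESTAMP '2024-04-08 10:00:00.000', retain_last => 5);
-- Clean up files no longer referenced by any table metadata (defaults to files older than 3 days)
CALL spark_catalog.system.remove_orphan_files(table => 'data_lake_ods.test_table');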
V. Table operations
1. Query snapshot data as of a given time (Spark 3.3)
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF '2024-02-29 17:10:31';
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF 499162860;  -- numeric form is a Unix timestamp in seconds
2. Query data of a specified snapshot (Spark 3.3)
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source VERSION AS OF 10963874102873;
3. Roll back a snapshot
CALL hadoop_prod.system.rollback_to_timestamp('mydb.mytest', TIMESTAMP '2021-12-23 16:56:40.000')
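Rolling back to an exact snapshot id (for example one found via the snapshots metadata table in section II) uses rollback_to_snapshot; a minimal sketch with an illustrative id:
CALL spark_catalog.system.rollback_to_snapshot('data_lake_ods.test_table', 10963874102873);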
VI. Recommended reading
1. NetEase Shufan (网易数帆)
https://www.6aiq.com/article/1686471152273