表治理-iceberg表手动治理常用命令
一、登录spark客户端
spark-sql --master yarn \
--deploy-mode client \
--queue default \
--name wang \
--driver-memory 12G \
--num-executors 10 \
--executor-cores 4 \
--executor-memory 20G
二、sql查询表信息
1、查询表快照信息
SELECT * FROM spark_catalog.data_lake_ods.test_table.snapshots order by committed_at desc
SELECT count(1) FROM spark_catalog.data_lake_ods.test_table.snapshots
2、查询删除文件
SELECT * FROM data_lake_ods.test_table.all_delete_files
SELECT count(1) FROM data_lake_ods.test_table.all_delete_files
3、查询表快照历史
select * from data_lake_ods.ods_bdg_dp_data_source.history
4、查询manifests
select * from iceberg_catalog.data_lake_ods.ods_bdg_dp_data_source.manifests
三、表治理
1、小文件合并
CALL spark_catalog.system.rewrite_data_files(
table => 'data_lake_ods.test_table',
options => map(
'partial-progress.enabled', 'true',
'rewrite-job-order', 'bytes-asc',
'partial-progress.max-commits', '10000',
'max-file-group-size-bytes', '1073741824',
'rewrite-all','true'
)
);
2、过期快照清理
CALL spark_catalog.system.expire_snapshots(table => 'data_lake_ods.test_table', older_than => TIMESTAMP '2024-04-08 10:00:00.000');
四、表操作
1、spark3.3查询指定时间快照数据
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF '2024-02-29 17:10:31';
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source TIMESTAMP AS OF 499162860;
2、spark3.3查询制定快照数据
SELECT * FROM data_lake_ods.ods_bdg_dp_data_source VERSION AS OF 10963874102873;
3、回滚快照
CALL hadoop_prod.system.rollback_to_timestamp('mydb.mytest', TIMESTAMP '2021-12-23 16:56:40.000')