使用iceberg-flink读取iceberg v2表
一、背景
mysql数据入湖后,有同事需要实时抽取iceberg v2表,想通过iceberg做分钟级实时数仓。目前flink社区暂不支持读取v2表。腾讯内部支持
目前只能用Oceanus内置connector,支持flink1.13版本。需要读写时都用iceberg-1.1去处理,因为写入是定制iceberg有对iceberg格式内容做增强,增强后才支持流式读取。
流式读取做加工时,sum函数因iceberg不支持,具体内容看图三
流式写入v2表
CREATE TABLE `iceberg_sink`(
`id` int,
`name` char(50),
`age` int,
`weight` double,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'iceberg-1.1',
'incremental.sequence.mode'='true',
'format-version' = '2',
'write.upsert.enabled' = 'true'
-- other properties ...
);
insert into iceberg_sink select * from xxx_source;
流式读取v2表
CREATE TABLE `iceberg_source`(
`id` int,
`name` char(50),
`age` int,
`weight` double,
PRIMARY KEY (id) NOT ENFORCED
) WITH (
'connector' = 'iceberg-1.1',
'incremental.sequence.mode'='true',
'format-version' = '2',
'write.upsert.enabled' = 'true'
'connector.iceberg.starting-strategy' = 'TABLE_SCAN_THEN_INCREMENTAL'
-- connector.iceberg.starting-strategy 有 4 个选项:
-- (1) TABLE_SCAN_THEN_INCREMENTAL: 先全量读,再增量读
-- (2) INCREMENTAL_FROM_LATEST_SNAPSHOT: 从最新的 snapshot 开始增量读 (inclusive)
-- (3) INCREMENTAL_FROM_EARLIEST_SNAPSHOT: 从最老的 snapshot 开始增量读 (inclusive)
-- (4) INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: 从指定时间戳的快照开始增量读(inclusive)。如果时间戳在两个快照之间,则应该从时间戳之后的快照开始。
-- other properties ...
);
insert into xxx_sink select * from iceberg_source;