# 2. 架构改进

## 2.2 新架构

1. 使用AWS DMS 数据迁移工具，将全量RDS Mysql 数据同步至S3存储中；
2. 通过Flink SQL Batch 作业将S3数据批量写入Hudi 表；
3. 建立Debeizum MySQL binlog 订阅任务，将binlog 数据实时同步至Kafka;
5. 通过hive-hudi meta data sync tools,同步hudi catalog数据至Hive，通过Hive/Trino提供OLAP数据查询。

## 2.3 新架构收益

• 数据使用及开发灵活度提升，地方放同步服务限制明显，改进后的架构易于扩展，并可以提供实时同步数据供其它业务使用；

# 3. 实践要点

## 3.1 Debezium 增量Binlog同步配置

Kafka connect 关键配置信息

bootstrap.servers=localhost:9092
# unique name for the cluster, used in forming the Connect cluster group. Note that this must not conflict with consumer group IDs
group.id=connect-cluster
# The converters specify the format of data in Kafka and how to translate it into Connect data. Every Connect user will
# need to configure these based on the format they want their data in when loaded from or stored into Kafka
key.converter=org.apache.kafka.connect.json.JsonConverter
value.converter=org.apache.kafka.connect.json.JsonConverter
# Converter-specific settings can be passed in by prefixing the Converter's setting with the converter we want to apply
key.converter.schemas.enable=true
value.converter.schemas.enable=true
# Topic to use for storing offsets. This topic should have many partitions and be replicated and compacted.
# Kafka Connect will attempt to create the topic automatically when needed, but you can always manually create
# the topic before starting Kafka Connect if a specific topic configuration is needed.
# Most users will want to use the built-in default replication factor of 3 or in some cases even specify a larger value.
# Since this means there must be at least as many brokers as the maximum replication factor used, we'd like to be able
# to run this example on a single-broker cluster and so here we instead set the replication factor to 1.
offset.storage.topic=connect-offsets
# Topic to use for storing connector and task configurations; note that this should be a single partition, highly replicated,
# and compacted topic. Kafka Connect will attempt to create the topic automatically when needed, but you can always manually create
# the topic before starting Kafka Connect if a specific topic configuration is needed.
# Most users will want to use the built-in default replication factor of 3 or in some cases even specify a larger value.
# Since this means there must be at least as many brokers as the maximum replication factor used, we'd like to be able
# to run this example on a single-broker cluster and so here we instead set the replication factor to 1.
config.storage.topic=connect-configs
# Topic to use for storing statuses. This topic can have multiple partitions and should be replicated and compacted.
# Kafka Connect will attempt to create the topic automatically when needed, but you can always manually create
# the topic before starting Kafka Connect if a specific topic configuration is needed.
# Most users will want to use the built-in default replication factor of 3 or in some cases even specify a larger value.
# Since this means there must be at least as many brokers as the maximum replication factor used, we'd like to be able
# to run this example on a single-broker cluster and so here we instead set the replication factor to 1.
status.storage.topic=connect-status


SQL
MySQL [(none)]> show binary logs;
| mysql-bin-changelog.094531 |    176317 |
| mysql-bin-changelog.094532 |    191443 |
| mysql-bin-changelog.094533 |   1102466 |
| mysql-bin-changelog.094534 |    273347 |
| mysql-bin-changelog.094535 |    141555 |
| mysql-bin-changelog.094536 |      4808 |
| mysql-bin-changelog.094537 |    146217 |
| mysql-bin-changelog.094538 |     29607 |
| mysql-bin-changelog.094539 |    141260 |
+----------------------------+-----------+
MySQL [(none)]> show binlog events in 'mysql-bin-changelog.094539';
MySQL [(none)]> show binlog events in 'mysql-bin-changelog.094539' limit 10;
+----------------------------+-----+----------------+------------+-------------+---------------------------------------------------------------------------+
| Log_name                   | Pos | Event_type     | Server_id  | End_log_pos | Info                                                                      |
+----------------------------+-----+----------------+------------+-------------+---------------------------------------------------------------------------+
| mysql-bin-changelog.094539 |   4 | Format_desc    | 1399745413 |         123 | Server ver: 5.7.31-log, Binlog ver: 4                                     |
| mysql-bin-changelog.094539 | 123 | Previous_gtids | 1399745413 |         194 | 90710e1c-f699-11ea-85c0-0ec6a6bed381:1-108842347                          |


$./bin/kafka-console-producer.sh -bootstrap-server localhost:9092 --topic connect-offsets --property "parse.key=true" --property "key.separator=>"$>["test_servername",{"server":"test_servername"}]>{"ts_sec":1647845014,"file":"mysql-bin-changelog.007051","pos":74121553,"row":1,"server_id":1404217221,"event":2}



{
"name":"test_servername",
"config":{
"connector.class":"io.debezium.connector.mysql.MySqlConnector",
"snapshot.locking.mode":"none",
"database.user":"db_user",
"transforms.Reroute.type":"io.debezium.transforms.ByLogicalTableRouter",
"database.server.id":"1820615119",
"database.history.kafka.bootstrap.servers":"localhost:9092",
"database.history.kafka.topic":"history-topic",
"inconsistent.schema.handling.mode":"skip",
"transforms":"Reroute", // 配置binlog数据转发到一个topic，默认一个表一个topic
"database.server.name":"test_servername",
"transforms.Reroute.topic.regex":"test_servername(.*)",
"database.port":"3306",
"include.schema.changes":"true",
"transforms.Reroute.topic.replacement":"binlog_data_topic",
"table.exclude.list":"table_test",
"database.hostname":"host",
"name":"test_servername",
"database.whitelist":"test_db",
"database.include.list":"test_db",
"snapshot.mode":"schema_only_recovery"  // 使用recovery模式从指定binlog文件的offset同步
}
}



## 3.2 Hudi 全量接增量数据写入

# 写入数据到ODS Raw表
insert_hudi_raw_query = '''
INSERT INTO
{0}_ods_raw.{1}
SELECT
{2}
FROM
{0}_debezium_kafka.kafka_rds_{1}_log
WHERE
REGEXP(GET_JSON_OBJECT(payload, '$.source.table'), '^{3}$')
AND GET_JSON_OBJECT(payload, '$.source.db') = '{4}' AND IF(GET_JSON_OBJECT(payload, \'$.op\') = \'d\', GET_JSON_OBJECT(payload, \'$.before.{5}\'), GET_JSON_OBJECT(payload, \'$.after.{5}\')) IS NOT NULL