
package org.apache.flink.connector.clickhouse;

import com.alibaba.fastjson.JSON;

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.clickhouse.internal.AbstractClickHouseOutputFormat;
import org.apache.flink.connector.clickhouse.internal.ClickHouseRowDataSinkFunction;
import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.DataType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Properties;

/**
 * Optimized Flink job that syncs data from Kafka into ClickHouse.
 */
public class FlinkSinkClickhouse {

    private static final Logger LOG = LoggerFactory.getLogger(FlinkSinkClickhouse.class);

    // Configuration constants (could be externalized to a config file)
    private static final String KAFKA_BOOTSTRAP_SERVERS = "172.31.28.167:9092";
    // Alternate bootstrap servers: "3.127.222.63:39092"

    private static final String KAFKA_TOPIC = "sc_smart_city_cdc";
    private static final String KAFKA_GROUP_ID = "flink-clickhouse-consumer-group";

    private static final String CLICKHOUSE_URL = "jdbc:clickhouse://172.31.43.170:8123,172.31.34.218:8123,172.31.33.16:8123";
    // Alternate endpoints:
    // "jdbc:clickhouse://current-web-1115249012.eu-central-1.elb.amazonaws.com:38123"
    // "jdbc:clickhouse://3.127.222.63:28123"

    private static final String CLICKHOUSE_DB = "sunseeker";
    private static final String CLICKHOUSE_TABLE = "ods_countlyV2";
    private static final String CLICKHOUSE_USER = "default";
    private static final String CLICKHOUSE_PASSWORD = "Snk@2024!";
    private static final int BATCH_SIZE = 6000;
    private static final int FLUSH_INTERVAL_MS = 1000;
    private static final int MAX_RETRIES = 3;

    // ClickHouse column names and matching Flink data types (must match the Mail POJO and the table schema)
    private static final List<String> FIELD_NAMES = Arrays.asList("appKey", "appVersion", "deviceId", "phone_no");
    private static final List<DataType> FIELD_TYPES = Arrays.asList(
            DataTypes.STRING(),    // appKey
            DataTypes.STRING(),    // appVersion
            DataTypes.STRING(),    // deviceId
            DataTypes.STRING()     // phone_no
    );
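
    /*
     * For reference, a hypothetical ClickHouse DDL that would match the four
     * columns above. The real ods_countlyV2 schema is not shown in the post;
     * the engine and ordering key here are assumptions:
     *
     *   CREATE TABLE sunseeker.ods_countlyV2 (
     *       appKey String, appVersion String, deviceId String, phone_no String
     *   ) ENGINE = MergeTree ORDER BY deviceId;
     */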

    public static void main(String[] args) throws Exception {
        LOG.info("Starting Kafka-to-ClickHouse sync job");

        // 1. Initialize the Flink execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Checkpoint tuning: exactly-once semantics, 5-second interval, 30-second timeout
        env.enableCheckpointing(5000);
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setCheckpointTimeout(30000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        env.setParallelism(8);  // Tune the parallelism to the cluster's resources

        // 2. Configure the Kafka consumer
        Properties kafkaProps = new Properties();
        kafkaProps.setProperty("bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS);
        kafkaProps.setProperty("group.id", KAFKA_GROUP_ID);
        kafkaProps.setProperty("enable.auto.commit", "false");  // Offsets are managed by checkpoints
        kafkaProps.setProperty("auto.offset.reset", "earliest");  // Start from the earliest offset when no committed offset exists

        // Create the Kafka consumer (group.id takes effect at construction time)
        FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(
                KAFKA_TOPIC,
                new SimpleStringSchema(),
                kafkaProps
        );
        kafkaConsumer.setCommitOffsetsOnCheckpoints(true);  // Commit offsets back to Kafka on checkpoint completion
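
        /*
         * Note: FlinkKafkaConsumer has been deprecated since Flink 1.14 in favor of the
         * unified KafkaSource API. A hedged sketch of the equivalent source, with the
         * same topic/group/offset settings as above:
         *
         *   KafkaSource<String> source = KafkaSource.<String>builder()
         *           .setBootstrapServers(KAFKA_BOOTSTRAP_SERVERS)
         *           .setTopics(KAFKA_TOPIC)
         *           .setGroupId(KAFKA_GROUP_ID)
         *           .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST))
         *           .setValueOnlyDeserializer(new SimpleStringSchema())
         *           .build();
         *   DataStream<String> stream =
         *           env.fromSource(source, WatermarkStrategy.noWatermarks(), "Kafka-Source");
         */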

        // 3. Read from Kafka and convert each JSON string to RowData (the format the ClickHouse sink expects)
        DataStreamSource<String> kafkaSource = env.addSource(kafkaConsumer, "Kafka-Source");
        LOG.info("Kafka source initialized, topic: {}", KAFKA_TOPIC);

        // Use the JsonToRowDataMapper defined below. Records that fail to parse are
        // mapped to null and filtered out so the sink never sees them.
        SingleOutputStreamOperator<RowData> dataStream = kafkaSource
                .map(new JsonToRowDataMapper())
                .filter(Objects::nonNull);
        // 4. Configure the ClickHouse sink
        // 4.1 Build the ClickHouse connection options
        ClickHouseDmlOptions clickHouseOptions = new ClickHouseDmlOptions.Builder()
                .withDatabaseName(CLICKHOUSE_DB)
                .withTableName(CLICKHOUSE_TABLE)
                .withUrl(CLICKHOUSE_URL)
                .withUsername(CLICKHOUSE_USER)
                .withPassword(CLICKHOUSE_PASSWORD)
                .withBatchSize(BATCH_SIZE)
                .withFlushInterval(Duration.ofMillis(FLUSH_INTERVAL_MS))
                .withMaxRetries(MAX_RETRIES)
                .withParallelism(env.getParallelism())
                .build();

        // 4.2 Connection properties (optional: timeouts, async insert, etc.)
        Properties clickHouseConnProps = new Properties();
        clickHouseConnProps.setProperty("socket_timeout", "300000");  // 5-minute socket timeout
        clickHouseConnProps.setProperty("connection_timeout", "10000");  // 10-second connection timeout
        clickHouseConnProps.setProperty("async_insert", "true");  // Enable ClickHouse asynchronous inserts

        // 4.3 Build the OutputFormat (field names and types must match the table schema exactly)
        AbstractClickHouseOutputFormat outputFormat = new AbstractClickHouseOutputFormat.Builder()
                .withOptions(clickHouseOptions)
                .withConnectionProperties(clickHouseConnProps)
                .withFieldNames(FIELD_NAMES.toArray(new String[0]))
                .withFieldTypes(FIELD_TYPES.toArray(new DataType[0]))
                .withPrimaryKey(new String[]{"deviceId"})  // deviceId is treated as the primary key
                .withPartitionKey(new String[]{})  // No partition key
                .build();
        // 4.4 Create the sink and attach it to the stream
        SinkFunction<RowData> clickHouseSink = new ClickHouseRowDataSinkFunction(outputFormat);
        dataStream
                .addSink(clickHouseSink)
                .name("ClickHouse-Sink")
                .setParallelism(8);

        // 5. Run the job (execute() blocks until the job terminates, so log before calling it)
        LOG.info("Job submitted, starting to sync data...");
        env.execute("Kafka-To-ClickHouse-Sync");
    }

    /**
     * Maps JSON strings to RowData, with exception handling.
     * Returns null for records that fail to parse; callers must filter nulls out.
     */
    private static class JsonToRowDataMapper extends RichMapFunction<String, RowData> {
        private static final long serialVersionUID = 1L;
        private transient Logger logger;
        private long total = 0;       // Total records seen
        private long success = 0;     // Successfully parsed
        private long fail = 0;        // Failed to parse

        @Override
        public void open(Configuration parameters) {
            logger = LoggerFactory.getLogger(JsonToRowDataMapper.class);
        }

        @Override
        public RowData map(String jsonStr) throws Exception {
            total++;
            try {
                Mail mail = parseJsonToMail(jsonStr);
                // Validate required fields (treat missing fields as a failure)
                if (mail.getAppKey() == null || mail.getDeviceId() == null) {
                    throw new IllegalArgumentException("Missing required fields");
                }
                success++;
                // Log stats every 1000 records (avoids flooding the log)
                if (total % 1000 == 0) {
                    logger.info("Parse stats: total={}, success={}, failed={}, failure rate={}%",
                            total, success, fail, (fail * 100.0) / total);
                }
                return GenericRowData.of(
                        StringData.fromString(mail.getAppKey()),
                        StringData.fromString(mail.getAppVersion()),
                        StringData.fromString(mail.getDeviceId()),
                        StringData.fromString(mail.getPhone_no())
                );
            } catch (Exception e) {
                fail++;
                // Log a concrete error every 100 failures (helps pinpoint bad data)
                if (fail % 100 == 0) {
                    logger.warn("Record #{} failed to parse, content: {}", total,
                            jsonStr.substring(0, Math.min(200, jsonStr.length())), e);
                }
                return null;
            }
        }

        // Parse JSON into a Mail object (a library such as Jackson would also work)
        private Mail parseJsonToMail(String jsonStr) {
            return JSON.parseObject(jsonStr, Mail.class);
        }
    }
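
    /*
     * The Mail POJO referenced above is not included in the original post.
     * The minimal sketch below is inferred from the getters used in
     * JsonToRowDataMapper and the four sink columns; any fields beyond
     * these four are unknown.
     */
    public static class Mail {
        private String appKey;
        private String appVersion;
        private String deviceId;
        private String phone_no;

        public String getAppKey() { return appKey; }
        public void setAppKey(String appKey) { this.appKey = appKey; }
        public String getAppVersion() { return appVersion; }
        public void setAppVersion(String appVersion) { this.appVersion = appVersion; }
        public String getDeviceId() { return deviceId; }
        public void setDeviceId(String deviceId) { this.deviceId = deviceId; }
        public String getPhone_no() { return phone_no; }
        public void setPhone_no(String phone_no) { this.phone_no = phone_no; }
    }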

}
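
For a quick local sanity check of the payload shape the job expects, here is a minimal, self-contained sketch (the field names come from the mapper above; the values are invented):

import com.alibaba.fastjson.JSON;

import java.util.LinkedHashMap;
import java.util.Map;

public class PayloadSketch {
    public static void main(String[] args) {
        // Build a flat JSON object shaped like the records the job consumes
        Map<String, String> payload = new LinkedHashMap<>();
        payload.put("appKey", "demo-app");
        payload.put("appVersion", "1.0.0");
        payload.put("deviceId", "dev-001");
        payload.put("phone_no", "13800000000");
        String json = JSON.toJSONString(payload);

        // Round-trip through the same fastjson call the job's mapper uses
        Map<?, ?> parsed = JSON.parseObject(json, Map.class);
        System.out.println(parsed.get("deviceId"));  // prints: dev-001
    }
}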

The accompanying Maven POM for the runner module:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>

	<parent>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-connector-clickhouse-parent</artifactId>
		<version>1.0.0-SNAPSHOT</version>
	</parent>

	<name>Flink : Connectors : Clickhouse : E2E Tests</name>
	<artifactId>flink-connector-clickhouse-runner</artifactId>
	<url>https://flink.apache.org</url>
	<packaging>jar</packaging>

<properties>
	<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
	<flink.version>1.17.0</flink.version>
	<testcontainer.version>1.17.3</testcontainer.version>
	<clickhouse-jdbc.version>0.4.6</clickhouse-jdbc.version>
	<scala.binary.version>2.12</scala.binary.version>
</properties>

<repositories>
	<!-- Aliyun mirror (faster access inside China) -->
	<repository>
		<id>aliyun</id>
		<url>https://maven.aliyun.com/repository/public</url>
	</repository>
	<!-- Maven Central -->
	<repository>
		<id>central</id>
		<url>https://repo1.maven.org/maven2</url>
	</repository>
	<!-- Apache snapshot repository (for Flink snapshot versions) -->
	<repository>
		<id>apache-snapshots</id>
		<url>https://repository.apache.org/content/repositories/snapshots/</url>
		<snapshots>
			<enabled>true</enabled>
		</snapshots>
	</repository>
	<!-- Testcontainers repository -->
	<repository>
		<id>testcontainers</id>
		<url>https://packages.testcontainers.com/maven/</url>
	</repository>
</repositories>

<dependencies>
	<dependency>
		<groupId>com.baomidou</groupId>
		<artifactId>mybatis-plus-core</artifactId>
		<version>3.5.3.1</version>
	</dependency>
	<dependency>
		<groupId>org.apache.kafka</groupId>
		<artifactId>kafka-clients</artifactId>
		<version>3.9.0</version>
	</dependency>
	<dependency>
		<groupId>org.apache.flink</groupId>
		<!-- The Scala-suffixed artifact only exists up to Flink 1.14; from 1.15 on
		     the artifact is flink-connector-kafka and should match flink.version -->
		<artifactId>flink-connector-kafka</artifactId>
		<version>${flink.version}</version>
	</dependency>
	<dependency>
		<groupId>com.alibaba</groupId>
		<artifactId>fastjson</artifactId>
		<version>1.2.59</version>
	</dependency>
	<dependency>
		<groupId>org.testcontainers</groupId>
		<artifactId>clickhouse</artifactId>
		<version>${testcontainer.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-connector-test-utils</artifactId>
		<version>${flink.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-table-api-java-bridge</artifactId>
		<version>${flink.version}</version>
	</dependency>
	<dependency>
		<groupId>org.apache.flink</groupId>
		<artifactId>flink-connector-clickhouse</artifactId>
		<version>1.0.0-SNAPSHOT</version>
	</dependency>
</dependencies>
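
	<!-- Packaging is not shown in the original post. Flink jobs are commonly
	     deployed as shade-plugin fat jars; the sketch below is an assumption
	     (the plugin version and mainClass are not from the original). -->
	<build>
		<plugins>
			<plugin>
				<groupId>org.apache.maven.plugins</groupId>
				<artifactId>maven-shade-plugin</artifactId>
				<version>3.4.1</version>
				<executions>
					<execution>
						<phase>package</phase>
						<goals>
							<goal>shade</goal>
						</goals>
						<configuration>
							<transformers>
								<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
									<mainClass>org.apache.flink.connector.clickhouse.FlinkSinkClickhouse</mainClass>
								</transformer>
							</transformers>
						</configuration>
					</execution>
				</executions>
			</plugin>
		</plugins>
	</build>
</project>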