Flink cluster
package org.apache.flink.connector.clickhouse;
import com.alibaba.fastjson.JSON;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.clickhouse.internal.AbstractClickHouseOutputFormat;
import org.apache.flink.connector.clickhouse.internal.ClickHouseRowDataSinkFunction;
import org.apache.flink.connector.clickhouse.internal.options.ClickHouseDmlOptions;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.SinkFunction;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.table.api.DataTypes;
import org.apache.flink.table.data.GenericRowData;
import org.apache.flink.table.data.RowData;
import org.apache.flink.table.data.StringData;
import org.apache.flink.table.types.DataType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
import static org.apache.flink.connector.clickhouse.config.ClickHouseConfigOptions.SinkShardingStrategy.HASH;
/**
 * Flink job that syncs data from Kafka to ClickHouse.
 */
public class FlinkSinkClickhouse {
// Logger
private static final Logger LOG = LoggerFactory.getLogger(FlinkSinkClickhouse.class);
// Configuration constants (could be externalized to a config file)
private static final String KAFKA_BOOTSTRAP_SERVERS = "172.31.28.167:9092";
// private static final String KAFKA_BOOTSTRAP_SERVERS = "3.127.222.63:39092";
private static final String KAFKA_TOPIC = "sc_smart_city_cdc";
private static final String KAFKA_GROUP_ID = "flink-clickhouse-consumer-group";
// Alternative endpoints:
// private static final String CLICKHOUSE_URL = "jdbc:clickhouse://current-web-1115249012.eu-central-1.elb.amazonaws.com:38123";
// private static final String CLICKHOUSE_URL = "jdbc:clickhouse://3.127.222.63:28123";
private static final String CLICKHOUSE_URL = "jdbc:clickhouse://172.31.43.170:8123,172.31.34.218:8123,172.31.33.16:8123";
private static final String CLICKHOUSE_DB = "sunseeker";
private static final String CLICKHOUSE_TABLE = "ods_countlyV2_all";
private static final String CLICKHOUSE_USER = "default";
private static final String CLICKHOUSE_PASSWORD = "Snk@2024!";
private static final int BATCH_SIZE = 6000;
private static final int FLUSH_INTERVAL_MS = 1000;
private static final int MAX_RETRIES = 3;
// ClickHouse column names and matching Flink data types (aligned with the Mail fields and the table schema)
private static final List<String> FIELD_NAMES = Arrays.asList("appKey", "appVersion", "deviceId", "phone_no");
private static final List<DataType> FIELD_TYPES = Arrays.asList(
DataTypes.STRING(), // appKey
DataTypes.STRING(), // appVersion
DataTypes.STRING(), // deviceId
DataTypes.STRING() // phone_no
);
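/*
 * For reference, a plausible ClickHouse table these four columns could map to.
 * This DDL is an illustrative assumption; the real schema of
 * sunseeker.ods_countlyV2_all is not shown in this file:
 *
 *   CREATE TABLE sunseeker.ods_countlyV2_all (
 *       appKey     String,
 *       appVersion String,
 *       deviceId   String,
 *       phone_no   String
 *   ) ENGINE = MergeTree()
 *   ORDER BY deviceId;
 */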
public static void main(String[] args) throws Exception {
LOG.info("Initializing Kafka-to-ClickHouse sync job");
// 1. Initialize the Flink execution environment
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Checkpoint configuration: exactly-once semantics, 5-second interval, 30-second timeout
env.enableCheckpointing(5000);
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointTimeout(30000);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
env.setParallelism(8); // tune parallelism to available cluster resources
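// Not part of the original job: a restart strategy is often configured so that
// transient Kafka/ClickHouse failures do not terminate the job permanently.
// A sketch (assumes org.apache.flink.api.common.restartstrategy.RestartStrategies):
// env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
//         3,                                                   // at most 3 restart attempts
//         org.apache.flink.api.common.time.Time.seconds(10))); // 10s delay between attempts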
// 2. Configure the Kafka consumer
Properties kafkaProps = new Properties();
kafkaProps.setProperty("bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS);
kafkaProps.setProperty("group.id", KAFKA_GROUP_ID);
kafkaProps.setProperty("enable.auto.commit", "false"); // offsets are managed by checkpoints
kafkaProps.setProperty("auto.offset.reset", "earliest"); // start from the earliest offset when the group has no committed offset
// Create the Kafka consumer (group.id takes effect at construction time)
FlinkKafkaConsumer<String> kafkaConsumer = new FlinkKafkaConsumer<>(
KAFKA_TOPIC,
new SimpleStringSchema(),
kafkaProps
);
kafkaConsumer.setCommitOffsetsOnCheckpoints(true); // commit offsets on successful checkpoints
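// Note: FlinkKafkaConsumer is deprecated as of Flink 1.14; the KafkaSource builder
// is its replacement. A sketch only, assuming the flink-connector-kafka artifact
// (the class names below are not used by this job as written):
// KafkaSource<String> source = KafkaSource.<String>builder()
//         .setBootstrapServers(KAFKA_BOOTSTRAP_SERVERS)
//         .setTopics(KAFKA_TOPIC)
//         .setGroupId(KAFKA_GROUP_ID)
//         .setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST))
//         .setValueOnlyDeserializer(new SimpleStringSchema())
//         .build();
// env.fromSource(source, WatermarkStrategy.noWatermarks(), "Kafka-Source");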
// 3. Read from Kafka and convert to RowData (the format expected by the ClickHouse sink)
DataStreamSource<String> kafkaSource = env.addSource(kafkaConsumer, "Kafka-Source");
LOG.info("Kafka source initialized, topic: {}", KAFKA_TOPIC);
SingleOutputStreamOperator<RowData> dataStream = kafkaSource.map(new MapFunction<String, RowData>() {
@Override
public RowData map(String value) throws Exception {
HashMap<String, String> hashMap = JSON.parseObject(value, HashMap.class);
return GenericRowData.of(
StringData.fromString(hashMap.get("appKey")),
StringData.fromString(hashMap.get("appVersion")),
StringData.fromString(hashMap.get("deviceId")),
StringData.fromString(hashMap.get("phone_no"))
);
}
});
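// Alternative: the JsonToRowDataMapper defined at the bottom of this class adds
// per-record error handling. It returns null for unparseable records, so nulls
// must be filtered out before the sink, e.g.:
// SingleOutputStreamOperator<RowData> dataStream = kafkaSource
//         .map(new JsonToRowDataMapper())
//         .filter(java.util.Objects::nonNull);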
// 4. Configure the ClickHouse sink
// 4.1 Build the ClickHouse connection options
ClickHouseDmlOptions clickHouseOptions = new ClickHouseDmlOptions.Builder()
.withDatabaseName(CLICKHOUSE_DB)
.withTableName(CLICKHOUSE_TABLE)
.withUrl(CLICKHOUSE_URL)
.withUsername(CLICKHOUSE_USER)
.withPassword(CLICKHOUSE_PASSWORD)
.withBatchSize(BATCH_SIZE)
.withFlushInterval(Duration.ofMillis(FLUSH_INTERVAL_MS))
.withMaxRetries(MAX_RETRIES)
.withParallelism(env.getParallelism())
.withUseLocal(Boolean.TRUE)
.withShardingStrategy(HASH)
.withShardingKey("deviceId")
.build();
// 4.2 Build connection properties (optional: timeouts, async inserts, etc.)
Properties clickHouseConnProps = new Properties();
clickHouseConnProps.setProperty("socket_timeout", "300000"); // 5-minute socket timeout
clickHouseConnProps.setProperty("connection_timeout", "10000"); // 10-second connection timeout
clickHouseConnProps.setProperty("async_insert", "true"); // enable ClickHouse asynchronous inserts
// 4.3 Build the OutputFormat (field names and types must match the table schema exactly)
AbstractClickHouseOutputFormat outputFormat = new AbstractClickHouseOutputFormat.Builder()
.withOptions(clickHouseOptions)
.withConnectionProperties(clickHouseConnProps)
.withFieldNames(FIELD_NAMES.toArray(new String[0]))
.withFieldTypes(FIELD_TYPES.toArray(new DataType[0]))
.withPrimaryKey(new String[]{"deviceId"}) // deviceId acts as the logical primary key
.withPartitionKey(new String[]{"deviceId"}) // aligned with the ClickHouse sharding key
.build();
// 4.4 Create the sink and attach it to the stream
SinkFunction<RowData> clickHouseSink = new ClickHouseRowDataSinkFunction(outputFormat);
dataStream
.addSink(clickHouseSink)
.name("ClickHouse-Sink")
.setParallelism(8);
// 5. Execute the job (env.execute blocks until the job terminates)
LOG.info("Submitting Kafka-to-ClickHouse sync job, starting data sync...");
env.execute("Kafka-To-ClickHouse-Sync");
}
/**
 * Maps JSON strings to RowData, with per-record error handling.
 * Note: not wired into the pipeline above; it returns null on parse failure,
 * so nulls must be filtered out before the sink when it is used.
 */
private static class JsonToRowDataMapper extends RichMapFunction<String, RowData> {
private static final long serialVersionUID = 1L;
private transient Logger logger;
private long total = 0; // total records processed
private long success = 0; // records parsed successfully
private long fail = 0; // records that failed to parse
@Override
public void open(Configuration parameters) {
logger = LoggerFactory.getLogger(JsonToRowDataMapper.class);
}
@Override
public RowData map(String jsonStr) throws Exception {
total++;
try {
Mail mail = parseJsonToMail(jsonStr);
// Validate required fields (treat missing fields as a parse failure)
if (mail.getAppKey() == null || mail.getDeviceId() == null) {
throw new IllegalArgumentException("missing required field");
}
success++;
// Log statistics every 1000 records (avoids flooding the log)
if (total % 1000 == 0) {
logger.info("Parse stats: total={}, success={}, fail={}, failure rate={}%",
total, success, fail, (fail * 100.0) / total);
}
return GenericRowData.of(
StringData.fromString(mail.getAppKey()),
StringData.fromString(mail.getAppVersion()),
StringData.fromString(mail.getDeviceId()),
StringData.fromString(mail.getPhone_no())
);
} catch (Exception e) {
fail++;
// Log one detailed error per 100 failures (helps locate bad data)
if (fail % 100 == 0) {
logger.warn("Failed to parse record #{}, content: {}", total, jsonStr.substring(0, Math.min(200, jsonStr.length())), e);
}
return null;
}
}
// Parse JSON into a Mail object (Jackson or another library could be used for more flexibility)
private Mail parseJsonToMail(String jsonStr) {
// Parses with fastjson; assumes jsonStr is well-formed JSON
return JSON.parseObject(jsonStr, Mail.class);
}
}
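/**
 * The Mail POJO referenced by JsonToRowDataMapper is not defined in this file.
 * A minimal sketch, assuming fastjson binds the four fields used above via
 * their getters/setters; remove it if the real Mail class exists elsewhere
 * in the package.
 */
public static class Mail {
private String appKey;
private String appVersion;
private String deviceId;
private String phone_no;
public String getAppKey() { return appKey; }
public void setAppKey(String appKey) { this.appKey = appKey; }
public String getAppVersion() { return appVersion; }
public void setAppVersion(String appVersion) { this.appVersion = appVersion; }
public String getDeviceId() { return deviceId; }
public void setDeviceId(String deviceId) { this.deviceId = deviceId; }
public String getPhone_no() { return phone_no; }
public void setPhone_no(String phone_no) { this.phone_no = phone_no; }
}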
}