Integrating Flink with Kafka: the Telecom Case Study (SQL Version) and Enabling Checkpoints in Flink SQL

Integrating Flink with Kafka: the Telecom Case Study (SQL Version)

The example below reads telecom user location records from Kafka, counts the distinct users (mdn) per city with Flink SQL, and writes the result to MySQL.

package com.shujia.flink.dx

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object Demo3CityFlowOnSql {
  def main(args: Array[String]): Unit = {
    // create the Flink execution environment
    val bsEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    // configure the table environment settings
    val bsSettings: EnvironmentSettings = EnvironmentSettings.newInstance()
      .useBlinkPlanner() // use the Blink planner
      .inStreamingMode() // streaming mode
      .build()

    // create the Flink table environment
    val bsTableEnv: StreamTableEnvironment = StreamTableEnvironment.create(bsEnv, bsSettings)

    /**
      * 1. Read the telecom user location data from Kafka
      */

    bsTableEnv.executeSql(
      """
        |
        |CREATE TABLE dianxin (
        |mdn STRING,
        |grid STRING,
        |city STRING,
        |county STRING,
        |tTime INT,
        |start_time STRING,
        |end_time STRING,
        |`date` STRING
        |) WITH (
        | 'connector' = 'kafka',
        | 'topic' = 'dianxin1',
        | 'properties.bootstrap.servers' = 'master:9092,node1:9092,node2:9092',
        | 'properties.group.id' = 'testGroup',
        | 'format' = 'csv',
        | 'scan.startup.mode' = 'earliest-offset',
        | 'csv.ignore-parse-errors' = 'true' -- skip rows that fail to parse
        |)
        |
      """.stripMargin)

    bsTableEnv.executeSql(
      """
        |CREATE TABLE print_table(
        |city STRING,
        |num BIGINT
        |)
        |WITH ('connector' = 'print')
        |
      """.stripMargin)

    // write the result to MySQL
    bsTableEnv.executeSql(
      """
        |CREATE TABLE city_num (
        |  city STRING,
        |  num BIGINT,
        |  PRIMARY KEY (city) NOT ENFORCED
        |) WITH (
        |   'connector' = 'jdbc',
        |   'url' = 'jdbc:mysql://master:3306/bigdata',
        |   'table-name' = 'city_count',
        |   'username' = 'root',
        |   'password' = '123456'
        |)
        |
      """.stripMargin)

    bsTableEnv.executeSql(
      """
        |insert into city_num
        |select
        |city,
        |count(distinct mdn) as num
        |from dianxin
        |group by city
        |
      """.stripMargin)

    /**
      * Package the code and upload it to the cluster, then submit with the command below.
      * The submission may fail with a pile of errors; in essence they all mean that
      * dependency jars are missing from Flink's lib directory (for this job, typically
      * the Kafka SQL connector, the JDBC connector, and the MySQL driver). Upload each
      * missing jar to Flink's lib directory and resubmit.
      * flink run -m yarn-cluster  -yjm 1024m -ytm 1096m -c com.shujia.flink.dx.Demo3CityFlowOnSql flink-1.0.jar
      */
  }
}
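
The JDBC sink writes into an existing MySQL table, so city_count has to be created in the bigdata database before the job runs. And because count(distinct mdn) over an unbounded group by keeps updating its result, the PRIMARY KEY (city) NOT ENFORCED declaration makes the JDBC connector write in upsert mode rather than append-only. A minimal sketch of the MySQL-side DDL, with assumed column types and widths:

CREATE TABLE IF NOT EXISTS city_count (
    city VARCHAR(64) NOT NULL,  -- city name; the width is an assumption
    num  BIGINT,                -- number of distinct users
    PRIMARY KEY (city)          -- matches the primary key declared in the Flink DDL
);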

Enabling Checkpoints in Flink SQL

Just add the code that enables checkpointing; the rest of the program is the same as before.

package com.shujia.flink.table

import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.bridge.scala.StreamTableEnvironment

object Demo11CheckPoint {
  def main(args: Array[String]): Unit = {
    // create the Flink execution environment
    val bsEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    bsEnv.setParallelism(1)

    // start a checkpoint every 1000 ms
    bsEnv.enableCheckpointing(1000)

    // advanced options:
    // set the checkpointing mode to exactly-once (this is the default)
    bsEnv.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)

    // require a pause of at least 500 ms between consecutive checkpoints
    bsEnv.getCheckpointConfig.setMinPauseBetweenCheckpoints(500)

    // a checkpoint must complete within one minute, otherwise it is discarded
    bsEnv.getCheckpointConfig.setCheckpointTimeout(60000)

    // allow only one checkpoint to be in progress at a time
    bsEnv.getCheckpointConfig.setMaxConcurrentCheckpoints(1)

    // retain the job's checkpoints when it is cancelled; note that in this case
    // the retained checkpoints have to be cleaned up manually
    bsEnv.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    // a state backend that saves the state to HDFS
    val stateBackend = new FsStateBackend("hdfs://master:9000/flink/checkpoint")

    // set the state backend
    bsEnv.setStateBackend(stateBackend)

    // configure the table environment settings
    val bsSettings: EnvironmentSettings = EnvironmentSettings.newInstance()
      .useBlinkPlanner() // use the Blink planner
      .inStreamingMode() // streaming mode
      .build()

    // create the Flink table environment
    val bsTableEnv: StreamTableEnvironment = StreamTableEnvironment.create(bsEnv, bsSettings)

    bsTableEnv.executeSql(
      """
        |
        |CREATE TABLE dianxin (
        |mdn STRING,
        |grid STRING,
        |city STRING,
        |county STRING,
        |tTime INT,
        |start_time STRING,
        |end_time STRING,
        |`date` STRING
        |) WITH (
        | 'connector' = 'kafka',
        | 'topic' = 'dianxin1',
        | 'properties.bootstrap.servers' = 'master:9092,node1:9092,node2:9092',
        | 'properties.group.id' = 'testGroup',
        | 'format' = 'csv',
        | 'scan.startup.mode' = 'earliest-offset',
        | 'csv.ignore-parse-errors' = 'true'
        |)
        |
      """.stripMargin)

    bsTableEnv.executeSql(
      """
        |CREATE TABLE print_table(
        |city STRING,
        |num BIGINT
        |)
        |WITH ('connector' = 'print')
        |
      """.stripMargin)

    bsTableEnv.executeSql(
      """
        |insert into print_table
        |select
        |city,
        |count(distinct mdn) as num
        |from dianxin
        |group by city
        |
      """.stripMargin)

  }
}
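
Because the checkpoints are retained on cancellation, the job can later be resumed from the last retained checkpoint via the -s flag of flink run. Assuming a retained checkpoint under the configured directory (the <job-id> and chk-<n> parts are placeholders to look up on HDFS), the command would look like:

flink run -s hdfs://master:9000/flink/checkpoint/<job-id>/chk-<n> -m yarn-cluster -yjm 1024m -ytm 1096m -c com.shujia.flink.table.Demo11CheckPoint flink-1.0.jar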