4 Flink source

一 Creating the execution environment

val localEnv = StreamExecutionEnvironment
  .createLocalEnvironment() // create a local environment

val remoteEnv = StreamExecutionEnvironment
  .createRemoteEnvironment(
    "host",               // hostname of the JobManager
    1234,                 // port of the JobManager process
    "path/to/jarFile.jar" // JAR file to ship to the JobManager
  ) // create a remote environment

val env = StreamExecutionEnvironment.getExecutionEnvironment // picks a local or cluster environment depending on the execution context

env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // use event time
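(Note: from Flink 1.12 onward, event time is the default time characteristic and setStreamTimeCharacteristic is deprecated, so the last call is only needed on older versions such as the one this post targets.)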

 

二 Flink sources

val sensorData: DataStream[SensorReading] = env
  .addSource(new SensorSource) // add a source
// a custom source extends RichParallelSourceFunction (see section 4 below for a full example)
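All of the examples rely on a SensorReading event type that this post never defines; a minimal sketch, with field names inferred from how the type is constructed below:

// assumed shape: sensor id, timestamp in milliseconds, temperature
case class SensorReading(id: String, timestamp: Long, temperature: Double)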

1 Reading data from a file

val stream = env.readTextFile(filePath) 
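readTextFile emits each line of the file as a String. A minimal parsing sketch, assuming (hypothetically) comma-separated id, timestamp, and temperature fields and the usual org.apache.flink.streaming.api.scala._ import:

val sensorStream: DataStream[SensorReading] = stream
  .map { line =>
    // assumed line format: "sensor_1,1547718199,35.8"
    val fields = line.split(",")
    SensorReading(fields(0).trim, fields(1).trim.toLong, fields(2).trim.toDouble)
  }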

2 Reading data from a collection

val stream = env
  .fromCollection(List(
    SensorReading("sensor_1", 1547718199, 35.80018327300259),
    SensorReading("sensor_6", 1547718199, 15.402984393403084),
    SensorReading("sensor_7", 1547718199, 6.720945201171228),
    SensorReading("sensor_10", 1547718199, 38.101067604893444)
  ))
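For a handful of individual values, fromElements is a lighter alternative that infers the element type from its arguments:

val numbers = env.fromElements(1L, 2L, 3L) // DataStream[Long]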

3 Using a Kafka message queue as the source

val properties = new Properties()
// Kafka consumer configuration
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("group.id", "consumer-group")
properties.setProperty(
  "key.deserializer",
  "org.apache.kafka.common.serialization.StringDeserializer"
)
properties.setProperty(
  "value.deserializer",
  "org.apache.kafka.common.serialization.StringDeserializer"
)
properties.setProperty("auto.offset.reset", "latest")

val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)

val stream = env
  // the source is a Kafka consumer reading the "hotitems" topic
  .addSource(
    new FlinkKafkaConsumer[String](
      "hotitems",
      new SimpleStringSchema(),
      properties
    )
  )
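Because the environment uses event time, the Kafka stream still needs timestamps and watermarks before event-time operators can run on it. A sketch using the periodic-watermark extractor from the same Flink 1.x API generation, assuming (hypothetically) that the second comma-separated field of each message is an epoch timestamp in milliseconds:

import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.windowing.time.Time

val withTimestamps = stream.assignTimestampsAndWatermarks(
  new BoundedOutOfOrdernessTimestampExtractor[String](Time.seconds(1)) {
    // hypothetical message layout: "itemId,timestampMs,..."
    override def extractTimestamp(element: String): Long =
      element.split(",")(1).trim.toLong
  }
)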

4 Custom data source

package test2

import java.util.Calendar

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

import scala.util.Random

// Continuously generates temperature readings, producing a stream of sensor data.
// A custom data source implements `RichParallelSourceFunction`.
// The event type produced by this source is `SensorReading`.
class SensorSource extends RichParallelSourceFunction[SensorReading] {
  // flag indicating whether the source is still running;
  // marked @volatile because cancel() may be called from another thread
  @volatile var running: Boolean = true

  // `run` continuously emits `SensorReading` events
  // through the provided `SourceContext`
  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    // random number generator for the temperature readings
    val rand = new Random

    // initialize ten (sensor id, temperature) tuples;
    // `(1 to 10)` iterates from 1 to 10
    var curFTemp = (1 to 10).map(
      // generate readings with Gaussian noise
      i => ("sensor_" + i, 65 + (rand.nextGaussian() * 20))
    )

    // infinite loop that produces the stream
    while (running) {
      // update the temperatures
      curFTemp = curFTemp.map(t => (t._1, t._2 + (rand.nextGaussian() * 0.5)))

      // current timestamp in milliseconds
      val curTime = Calendar.getInstance.getTimeInMillis

      // emit records via `SourceContext.collect`;
      // Flink operators generally send data downstream with `collect`
      curFTemp.foreach(t => ctx.collect(SensorReading(t._1, curTime, t._2)))

      // emit a batch of readings every 300 ms
      Thread.sleep(300)
    }
  }

  // stop the infinite loop when the job is cancelled
  override def cancel(): Unit = running = false
}

package test2

import org.apache.flink.streaming.api.scala._

object SourceFromCustomDataSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream = env
      // add the custom source
      .addSource(new SensorSource)

    stream.print()

    env.execute()
  }
}

三 Miscellaneous

slotSharingGroup("name") // assign the operator to a named slot-sharing group; operators in the same group may be scheduled into the same task slot
disableChaining() // prevent this operator from being chained with its upstream/downstream operators into a single task
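A sketch of where these operator-level settings attach (the group name and the map function are hypothetical):

val result = stream
  .map(r => r.temperature)
  .slotSharingGroup("sensors") // this operator (and, by default, its downstream operators) run in the "sensors" slot group
  .disableChaining()           // run this operator as its own task, unchained from its neighbors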