I Building the Execution Environment
val localEnv = StreamExecutionEnvironment
.createLocalEnvironment() // build a local environment
val remoteEnv = StreamExecutionEnvironment
.createRemoteEnvironment(
"host", // hostname of the JobManager
1234, // port of the JobManager process
"path/to/jarFile.jar" // JAR file to ship to the JobManager
) // build a remote environment
val env = StreamExecutionEnvironment.getExecutionEnvironment // returns the environment matching the current context: a local one inside the IDE, the cluster environment when the job is submitted
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime) // use event-time semantics
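With event time enabled, time-based operators also need timestamps and watermarks on each stream. A minimal sketch, assuming a DataStream[SensorReading] named sensorData (built in the next section) whose timestamp field is in milliseconds, with 5 seconds of allowed out-of-orderness:

import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.windowing.time.Time

val withTimestamps = sensorData.assignTimestampsAndWatermarks(
  // watermarks trail the largest timestamp seen so far by 5 seconds
  new BoundedOutOfOrdernessTimestampExtractor[SensorReading](Time.seconds(5)) {
    override def extractTimestamp(r: SensorReading): Long = r.timestamp
  }
)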
II Flink Sources
val sensorData: DataStream[SensorReading] = env
.addSource(new SensorSource) // add a source
// a custom parallel source extends RichParallelSourceFunction; a non-parallel one implements SourceFunction
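All of the examples below use a SensorReading event type, which this section never defines. A sketch inferred from the fields used in the code (sensor id, timestamp in milliseconds, temperature):

case class SensorReading(id: String, timestamp: Long, temperature: Double)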
1 Reading data from a file
val stream = env.readTextFile(filePath)
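readTextFile produces a DataStream[String] with one element per line, so a map step is needed to get SensorReading values out of it. A sketch, assuming each line is comma-separated in the same field order as the case class:

val sensorStream: DataStream[SensorReading] = stream.map { line =>
  // split "id,timestamp,temperature" into its three fields
  val fields = line.split(",")
  SensorReading(fields(0).trim, fields(1).trim.toLong, fields(2).trim.toDouble)
}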
2 Reading data from a collection
val stream = env
.fromCollection(List(
SensorReading("sensor_1", 1547718199, 35.80018327300259),
SensorReading("sensor_6", 1547718199, 15.402984393403084),
SensorReading("sensor_7", 1547718199, 6.720945201171228),
SensorReading("sensor_10", 1547718199, 38.101067604893444)
))
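For quick tests, fromElements is a shorter variant of the same idea, taking the values directly as varargs (a sketch):

val testStream = env.fromElements(
  SensorReading("sensor_1", 1547718199, 35.8),
  SensorReading("sensor_6", 1547718201, 15.4)
)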
3 Using a Kafka message queue as the data source
val properties = new Properties()
// set the Kafka consumer parameters
properties.setProperty("bootstrap.servers", "localhost:9092")
properties.setProperty("group.id", "consumer-group")
properties.setProperty(
"key.deserializer",
"org.apache.kafka.common.serialization.StringDeserializer"
)
properties.setProperty(
"value.deserializer",
"org.apache.kafka.common.serialization.StringDeserializer"
)
properties.setProperty("auto.offset.reset", "latest")
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)
val stream = env
// the source is data from Kafka; here we instantiate a consumer for the topic "hotitems"
.addSource(
new FlinkKafkaConsumer[String](
"hotitems",
new SimpleStringSchema(),
properties
)
)
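The Kafka connector also provides a sink, so a stream can be written back to Kafka. A sketch, assuming a hypothetical output topic named "output" on the same broker:

import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer

stream.addSink(
  // serialize each String record with SimpleStringSchema and write it to "output"
  new FlinkKafkaProducer[String]("localhost:9092", "output", new SimpleStringSchema())
)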
4 Custom data source
package test2
import java.util.Calendar
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import scala.util.Random
// Continuously generates temperature readings, producing an endless data stream
// A custom parallel data source extends `RichParallelSourceFunction`
// The events produced by this source are of type `SensorReading`
class SensorSource extends RichParallelSourceFunction[SensorReading] {
// flag indicating whether the source is still running; `true` means running
// `@volatile` because `cancel()` is invoked from a different thread
@volatile var running: Boolean = true
// `run` keeps emitting `SensorReading` records without interruption
// records are emitted through the `SourceContext`
override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
// initialize a random number generator for the temperature readings
val rand = new Random
// initialize ten (sensor id, temperature) tuples
// `(1 to 10)` iterates from 1 to 10
var curFTemp = (1 to 10).map(
// derive each initial reading from Gaussian noise
i => ("sensor_" + i, 65 + (rand.nextGaussian() * 20))
)
// infinite loop that produces the stream
while (running) {
// update each temperature with a small random drift
curFTemp = curFTemp.map(t => (t._1, t._2 + (rand.nextGaussian() * 0.5)))
// current timestamp in milliseconds
val curTime = Calendar.getInstance.getTimeInMillis
// emit the records via `SourceContext.collect`;
// Flink operators generally forward records downstream through `collect`
curFTemp.foreach(t => ctx.collect(SensorReading(t._1, curTime, t._2)))
// wait 300 ms before emitting the next batch
Thread.sleep(300)
}
}
// stop the infinite loop when the job is cancelled
override def cancel(): Unit = running = false
}
package test2
import org.apache.flink.streaming.api.scala._
object SourceFromCustomDataSource {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
val stream = env
// add the custom data source
.addSource(new SensorSource)
stream.print()
env.execute()
}
}
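Once the custom source is in place, the stream supports the usual transformations. A quick usage sketch, filtering hot readings and projecting them to (id, temperature) pairs (the 80-degree threshold is an arbitrary example value):

val hotSensors = stream
  .filter(_.temperature > 80) // keep only readings above the threshold
  .map(r => (r.id, r.temperature)) // keep just the id and the temperature
hotSensors.print()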
III Miscellaneous
slotSharingGroup("name") // assign this operator to a named slot-sharing group; subtasks of operators in the same group may share a task slot
disableChaining() // prevent this operator from being chained with its upstream/downstream operators into one task
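Both are called on the result of a transformation and affect that operator only. A sketch, assuming the sensorData stream from above (the group name "green" is arbitrary):

val celsius = sensorData
  .map(r => r.copy(temperature = (r.temperature - 32) * (5.0 / 9))) // Fahrenheit to Celsius
  .slotSharingGroup("green") // put this map's subtasks into the "green" slot-sharing group
  .disableChaining() // do not chain this map with its neighbors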