Flink行走江湖Operators:底层API-Process Function

我们知道,Flink把API分成了好几层,每一层所能够看到的都各不相同。
最底层的就是Process Function,它能够

  • 访问events
  • 访问状态State(容错,一致性,但是仅仅是在keyed流上)
  • 计时器,也仅仅在keyed流上。

KeyedProcessFunction

下面做一个案例,每经过一段时间,发出最新的keyvalue状态

onTimer是基于ProcessTime时

package flinkscala.ProcessFunction

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.util.Collector

case class CountwithTimestamp(key:String,count:Long,lastModified:Long)
object processfunctionTest1 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
//    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val datastream = env.socketTextStream("127.0.0.1",9000)
        .map(data => {
            val dataArray = data.split(",")
          (dataArray(0),dataArray(1))
          }
        )
//      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[(String, String)](Time.seconds(3)) {
//        override def extractTimestamp(element: (String, String)): Long = element._2.toLong * 1000
//      })
      .keyBy(_._1)
      .process(new CountWithTimeoutFunction()).print()
    env.execute()
  }
}

class CountWithTimeoutFunction extends KeyedProcessFunction[String,(String,String),(String,Long)]{

    //状态,保存CountWithTimeStamp
  lazy val state: ValueState[CountwithTimestamp] = getRuntimeContext
    .getState(new ValueStateDescriptor[CountwithTimestamp]("mystate",classOf[CountwithTimestamp]))


  override def processElement(value: (_root_.scala.Predef.String, _root_.scala.Predef.String), ctx: _root_.org.apache.flink.streaming.api.functions.KeyedProcessFunction[_root_.scala.Predef.String, (_root_.scala.Predef.String, _root_.scala.Predef.String), (_root_.scala.Predef.String, Long)]#Context, out: _root_.org.apache.flink.util.Collector[(_root_.scala.Predef.String, Long)]): Unit = {
//    System.out.println(value)
    val currentprocesstime = ctx.timerService().currentProcessingTime()
    val current= state.value() match {
      case null =>
        CountwithTimestamp(value._1, 1, currentprocesstime)
      case CountwithTimestamp(key, count, lastModified) =>
        CountwithTimestamp(key, count + 1, currentprocesstime)
    }
    state.update(current)
//    ctx.timerService().registerEventTimeTimer(current.lastModified+6000)
    System.out.println("currentProcessingTime:>>>>>"+currentprocesstime)
    System.out.println("currentWaterMark>>>>>>"+ctx.timerService().currentWatermark())
    System.out.println("ctx.timestamp>>>>>>"+ctx.timestamp())
    //注册定时器
    ctx.timerService().registerProcessingTimeTimer(currentprocesstime+1000)
  }
  override def onTimer(timestamp: Long, ctx: _root_.org.apache.flink.streaming.api.functions.KeyedProcessFunction[_root_.scala.Predef.String, (_root_.scala.Predef.String, _root_.scala.Predef.String), (_root_.scala.Predef.String, Long)]#OnTimerContext, out: _root_.org.apache.flink.util.Collector[(_root_.scala.Predef.String, Long)]): Unit = {
    System.out.println("计时器到期了")
    state.value() match {
      case CountwithTimestamp(key,count,lastModified)
        if(timestamp == lastModified + 1000) => out.collect((timestamp.toString,lastModified))
      case _ =>
    }
  }
}

输入:
在这里插入图片描述
运行结果:
在这里插入图片描述
可以看到,一共只有两个计时器到期了,因为我们可以看到上面输出的currentPrcessingTimer是只有两种,第一个是一种,而后面三个是相同的,一个计时器只针对时间戳决定,多个一样的时间戳只会出现一个计时器。

  • 第一个计时器
    处理时间:1583331466746
    计时器到期时间:1583331466746+1000

  • 第二个计时器
    处理时间:1583331466761
    计时器到期时间:1583331466761+1000

**上面都是基于ProcessingTime的那么如果将其换成EventTime呢

EventTime

package flinkscala.ProcessFunction

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

object processfunctionTest2 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val datastream = env.socketTextStream("127.0.0.1",9000)
      .map(data => {
        val dataArray = data.split(",")
        (dataArray(0),dataArray(1))
      }
      )
            .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[(String, String)](Time.seconds(3)) {
              override def extractTimestamp(element: (String, String)): Long = element._2.toLong * 1000
            })
      .keyBy(_._1)
      .process(new CountWithTimeoutFunction2()).print()
    env.execute()
  }
}

class CountWithTimeoutFunction2 extends KeyedProcessFunction[String,(String,String),(String,Long)]{

  //状态,保存CountWithTimeStamp
  lazy val state: ValueState[CountwithTimestamp] = getRuntimeContext
    .getState(new ValueStateDescriptor[CountwithTimestamp]("mystate",classOf[CountwithTimestamp]))


  override def processElement(value: (_root_.scala.Predef.String, _root_.scala.Predef.String), ctx: _root_.org.apache.flink.streaming.api.functions.KeyedProcessFunction[_root_.scala.Predef.String, (_root_.scala.Predef.String, _root_.scala.Predef.String), (_root_.scala.Predef.String, Long)]#Context, out: _root_.org.apache.flink.util.Collector[(_root_.scala.Predef.String, Long)]): Unit = {
    //    System.out.println(value)
    val currentEventtime = ctx.timestamp()
    val current= state.value() match {
      case null =>
        CountwithTimestamp(value._1, 1, currentEventtime)
      case CountwithTimestamp(key, count, lastModified) =>
        CountwithTimestamp(key, count + 1, currentEventtime)
    }
    state.update(current)
    //    ctx.timerService().registerEventTimeTimer(current.lastModified+6000)
    System.out.println("currentEventTime:>>>>>"+currentEventtime)
    System.out.println("currentWaterMark>>>>>>"+ctx.timerService().currentWatermark())
    System.out.println("ctx.timestamp>>>>>>"+ctx.timestamp())
    //注册定时器
    ctx.timerService().registerEventTimeTimer(currentEventtime+1000)
  }
  override def onTimer(timestamp: Long, ctx: _root_.org.apache.flink.streaming.api.functions.KeyedProcessFunction[_root_.scala.Predef.String, (_root_.scala.Predef.String, _root_.scala.Predef.String), (_root_.scala.Predef.String, Long)]#OnTimerContext, out: _root_.org.apache.flink.util.Collector[(_root_.scala.Predef.String, Long)]): Unit = {
    System.out.println(timestamp+":计时器到期了"+state.value())
    state.value() match {
      case CountwithTimestamp(key,count,lastModified)
        if(timestamp >= lastModified) => out.collect((timestamp.toString,lastModified))
      case _ => System.out.println("什么都没有!!!!")
    }
  }
}

运行一下:
在这里插入图片描述
这里用了watermark,是对于EventTime*1000并且延迟了3秒钟,所以我们看到的watermark都会比实际EventTime小3000毫秒。
在输入a,14后,这时候的watermark应该是11000毫秒,但是输出的是10000毫秒
这是因为,watermark夹在中间了,比如我们输入11的时候,它前面携带的watermark却是7000,这个7000是因为processElement还没有执行完,对应的watermark还没有更新,所以在这个方法里看还是上次的watermark,但是一旦执行完,它就会更新到最新的,比如在14000的时候,我们看到的还是10000,但是一旦执行完了processElement方法,就会更新watermark为11000,这时候就认为11000之前的数据都到了,那么就会触发第一个定时器,因为它是11000的时候到期,所以被计时器生效了。NICE!!!

posted @ 2020-03-04 17:41  ongbo  阅读(59)  评论(0)    收藏  举报