Flink行走江湖Operators：底层API-Process Function

我们知道，Flink把API分成了好几层，每一层所能够看到的都各不相同。
最底层的就是Process Function，它能够

访问events
访问状态State（容错，一致性，但是仅仅是在keyed流上）
计时器，也仅仅在keyed流上。

KeyedProcessFunction

下面做一个案例，每经过一段时间，发出最新的keyvalue状态

onTimer是基于ProcessTime时

package flinkscala.ProcessFunction

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.util.Collector

/**
  * Per-key state kept by the process functions in this example.
  *
  * @param key          the key of the keyed stream this state belongs to
  * @param count        number of elements processed for the key so far
  * @param lastModified timestamp, in milliseconds, recorded for the most recently processed
  *                     element (processing time or event time, depending on the function
  *                     that stores it)
  */
case class CountwithTimestamp(key:String,count:Long,lastModified:Long)
/**
  * Processing-time demo job: reads comma-separated `key,value` lines from a local socket,
  * keys the stream by the first field and runs [[CountWithTimeoutFunction]] on it,
  * printing whatever the function emits.
  */
object processfunctionTest1 {

  /** Builds the pipeline and blocks in `execute()` until the job terminates. */
  def main(args: Array[String]): Unit = {
    val environment = StreamExecutionEnvironment.getExecutionEnvironment
    // A single parallel instance keeps the console output of the demo in order.
    environment.setParallelism(1)

    // Raw text lines, one record per line, from 127.0.0.1:9000.
    val lines = environment.socketTextStream("127.0.0.1", 9000)

    // Split every line into a (key, value) pair of strings.
    val pairs = lines.map { line =>
      val fields = line.split(",")
      (fields(0), fields(1))
    }

    // This job runs on processing time. To experiment with event time instead, switch the
    // environment to TimeCharacteristic.EventTime and assign timestamps and watermarks to
    // `pairs` before keying the stream.
    pairs
      .keyBy(_._1)
      .process(new CountWithTimeoutFunction())
      .print()

    environment.execute()
  }
}

/**
  * Keeps a running element count per key and, once a key has received no further element
  * for one second of processing time, emits the key together with its current count.
  *
  * Input elements are `(key, value)` string pairs; output elements are `(key, count)` pairs.
  */
class CountWithTimeoutFunction extends KeyedProcessFunction[String,(String,String),(String,Long)]{

  // Delay, in milliseconds of processing time, between an element and the timer it schedules.
  private final val TimerDelayMs = 1000L

  // Per-key state holding the key, its count and the processing time of its latest element.
  // Resolved lazily on first access because the runtime context is not available while the
  // function object is being constructed.
  lazy val state: ValueState[CountwithTimestamp] = getRuntimeContext
    .getState(new ValueStateDescriptor[CountwithTimestamp]("mystate", classOf[CountwithTimestamp]))

  /**
    * Updates the count for the current key and schedules a processing-time timer
    * `TimerDelayMs` after the element was processed.
    *
    * @param value the incoming `(key, value)` pair; only the key is used
    * @param ctx   gives access to the timer service and the element timestamp
    * @param out   collector for results (not used here; results are emitted from `onTimer`)
    */
  override def processElement(
      value: (String, String),
      ctx: KeyedProcessFunction[String, (String, String), (String, Long)]#Context,
      out: Collector[(String, Long)]): Unit = {
    val now = ctx.timerService().currentProcessingTime()

    // state.value() is null the first time a key is seen; Option(...) turns that into None.
    val updated = Option(state.value()) match {
      case Some(previous) => previous.copy(count = previous.count + 1, lastModified = now)
      case None           => CountwithTimestamp(value._1, 1, now)
    }
    state.update(updated)

    // Diagnostic output used by the walkthrough to compare the different notions of time.
    System.out.println(s"currentProcessingTime:>>>>>$now")
    System.out.println(s"currentWaterMark>>>>>>${ctx.timerService().currentWatermark()}")
    System.out.println(s"ctx.timestamp>>>>>>${ctx.timestamp()}")

    // Schedule the timer for this element.
    ctx.timerService().registerProcessingTimeTimer(now + TimerDelayMs)
  }

  /**
    * Called when a timer fires. Emits `(key, count)` only if the firing timer is the one
    * scheduled by the latest element of the key; timers scheduled by earlier elements are
    * ignored because the state has been modified since.
    *
    * @param timestamp the timestamp the firing timer was registered for
    * @param ctx       timer context (unused)
    * @param out       collector receiving the `(key, count)` result
    */
  override def onTimer(
      timestamp: Long,
      ctx: KeyedProcessFunction[String, (String, String), (String, Long)]#OnTimerContext,
      out: Collector[(String, Long)]): Unit = {
    System.out.println("计时器到期了")
    Option(state.value())
      .filter(current => timestamp == current.lastModified + TimerDelayMs)
      .foreach(current => out.collect((current.key, current.count)))
  }
}

输入：

运行结果：

可以看到，一共只有两个计时器到期了。从上面输出的currentProcessingTime可以看出，处理时间只有两种取值：第一条数据是一种，后面三条数据的处理时间相同。计时器由时间戳决定，同一个key下多个相同的时间戳只会注册一个计时器。

第一个计时器
处理时间：1583331466746
计时器到期时间：1583331466746+1000
第二个计时器
处理时间：1583331466761
计时器到期时间：1583331466761+1000

上面都是基于ProcessingTime的，那么如果将其换成EventTime呢？

EventTime

package flinkscala.ProcessFunction

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

/**
  * Event-time demo job: reads comma-separated `key,seconds` lines from a local socket,
  * assigns event timestamps and watermarks from the second field, keys the stream by the
  * first field and runs [[CountWithTimeoutFunction2]] on it, printing whatever it emits.
  */
object processfunctionTest2 {

  /** Builds the pipeline and blocks in `execute()` until the job terminates. */
  def main(args: Array[String]): Unit = {
    val environment = StreamExecutionEnvironment.getExecutionEnvironment
    // A single parallel instance keeps the console output of the demo in order.
    environment.setParallelism(1)
    environment.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // The second field is an event time in seconds; the timestamp is that value in
    // milliseconds, and the extractor is configured with a 3 second out-of-orderness bound.
    val watermarkAssigner =
      new BoundedOutOfOrdernessTimestampExtractor[(String, String)](Time.seconds(3)) {
        override def extractTimestamp(element: (String, String)): Long = element._2.toLong * 1000
      }

    // Raw text lines from 127.0.0.1:9000, split into (key, seconds) string pairs.
    val pairs = environment
      .socketTextStream("127.0.0.1", 9000)
      .map { line =>
        val fields = line.split(",")
        (fields(0), fields(1))
      }

    pairs
      .assignTimestampsAndWatermarks(watermarkAssigner)
      .keyBy(_._1)
      .process(new CountWithTimeoutFunction2())
      .print()

    environment.execute()
  }
}

/**
  * Event-time variant of the counting function: keeps a running element count per key and
  * schedules an event-time timer one second after each element's timestamp. When a timer
  * fires and no element with a later event time than the timer has been recorded for the
  * key, the key and its current count are emitted.
  *
  * Input elements are `(key, value)` string pairs and must carry an event-time timestamp;
  * output elements are `(key, count)` pairs.
  */
class CountWithTimeoutFunction2 extends KeyedProcessFunction[String,(String,String),(String,Long)]{

  // Delay, in milliseconds of event time, between an element and the timer it schedules.
  private final val TimerDelayMs = 1000L

  // Per-key state holding the key, its count and the event time of its latest element.
  // Resolved lazily on first access because the runtime context is not available while the
  // function object is being constructed.
  lazy val state: ValueState[CountwithTimestamp] = getRuntimeContext
    .getState(new ValueStateDescriptor[CountwithTimestamp]("mystate", classOf[CountwithTimestamp]))

  /**
    * Updates the count for the current key and schedules an event-time timer
    * `TimerDelayMs` after the element's timestamp.
    *
    * @param value the incoming `(key, value)` pair; only the key is used
    * @param ctx   gives access to the element timestamp and the timer service
    * @param out   collector for results (not used here; results are emitted from `onTimer`)
    * @throws IllegalStateException if the element carries no event-time timestamp
    */
  override def processElement(
      value: (String, String),
      ctx: KeyedProcessFunction[String, (String, String), (String, Long)]#Context,
      out: org.apache.flink.util.Collector[(String, Long)]): Unit = {
    // ctx.timestamp() is a nullable java.lang.Long; fail with a clear message instead of an
    // opaque NullPointerException from implicit unboxing.
    val eventTime: Long = Option(ctx.timestamp()) match {
      case Some(ts) => ts.longValue
      case None =>
        throw new IllegalStateException(
          "Element carries no event-time timestamp; assign timestamps and watermarks upstream")
    }

    // state.value() is null the first time a key is seen; Option(...) turns that into None.
    val updated = Option(state.value()) match {
      case Some(previous) => previous.copy(count = previous.count + 1, lastModified = eventTime)
      case None           => CountwithTimestamp(value._1, 1, eventTime)
    }
    state.update(updated)

    // Diagnostic output used by the walkthrough to compare event time and watermark.
    System.out.println(s"currentEventTime:>>>>>$eventTime")
    System.out.println(s"currentWaterMark>>>>>>${ctx.timerService().currentWatermark()}")
    System.out.println(s"ctx.timestamp>>>>>>${ctx.timestamp()}")

    // Schedule the timer for this element.
    ctx.timerService().registerEventTimeTimer(eventTime + TimerDelayMs)
  }

  /**
    * Called when an event-time timer fires. Emits `(key, count)` when the timer's timestamp
    * is not earlier than the event time of the latest recorded element; otherwise only
    * prints a diagnostic line.
    *
    * @param timestamp the timestamp the firing timer was registered for
    * @param ctx       timer context (unused)
    * @param out       collector receiving the `(key, count)` result
    */
  override def onTimer(
      timestamp: Long,
      ctx: KeyedProcessFunction[String, (String, String), (String, Long)]#OnTimerContext,
      out: org.apache.flink.util.Collector[(String, Long)]): Unit = {
    System.out.println(s"${timestamp}:计时器到期了${state.value()}")
    Option(state.value()) match {
      case Some(current) if timestamp >= current.lastModified =>
        out.collect((current.key, current.count))
      case _ =>
        System.out.println("什么都没有！！！！")
    }
  }
}

运行一下：

这里用了watermark，是对于EventTime*1000并且延迟了3秒钟，所以我们看到的watermark都会比实际EventTime小3000毫秒。
在输入a,14后，这时候的watermark应该是11000毫秒，但是输出的是10000毫秒
这是因为watermark是跟在数据后面到达的：比如我们输入11的时候，在processElement里看到的watermark却是7000。这是因为此时processElement还没有执行完，这条数据对应的watermark还没有更新，所以在这个方法里看到的还是上一次的watermark；一旦方法执行完、新的watermark到达，它就会更新到最新的值。同理，在输入14（即14000）的时候，我们看到的watermark还是10000，但是processElement执行完之后，watermark就会更新为11000，这时候就认为11000之前的数据都到了，于是触发第一个定时器——它正是在11000到期的，所以这个计时器被触发了。NICE！！！

posted @ 2020-03-04 17:41 ongbo 阅读(59) 评论(0) 收藏举报

刷新页面返回顶部

ongbo

Flink行走江湖Operators：底层API-Process Function

KeyedProcessFunction

下面做一个案例，每经过一段时间，发出最新的keyvalue状态

onTimer是基于ProcessTime时

EventTime

公告