词频统计

  统计一个文件中每个单词出现的频率,单词之间以制表符(\t)分隔。

Java实现

    /**
     * Counts how often each word occurs in a text file and prints the per-word counts.
     *
     * <p>Every line is split on the tab character. Empty tokens (from blank lines or
     * from leading/consecutive tabs) are skipped so that the empty string is never
     * counted as a word.
     *
     * @param path path of the text file to read
     * @throws Exception if the job fails to execute
     */
    public static void streamSource(String path) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> streamSource = env.readTextFile(path);

        streamSource.flatMap((String value, Collector<String> out) -> {
            for (String word : value.split("\t")) {
                // "".split("\t") yields a single empty string and "a\t\tb" yields an
                // empty middle token; neither of them is a word.
                if (!word.isEmpty()) {
                    out.collect(word);
                }
            }
        }).returns(Types.STRING) // the lambda's generic output type is erased, so declare it
                .map(word -> Tuple2.of(word, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(t -> t.f0)
                .sum(1)
                .print();
        env.execute("word count");
    }

 

/**
 * Streaming word count: reads comma-separated text from a socket and prints the
 * count of each word for every 5-second window.
 */
public class StreamingJob {

    /**
     * Builds the pipeline and runs the job.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the job fails to execute
     */
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataStreamSource<String> text = env.socketTextStream("192.168.21.128", 9999);
        text.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String value, Collector<Tuple2<String, Integer>> collector) throws Exception {
                // Lower-case the line first so that counting is case-insensitive.
                for (String token : value.toLowerCase().split(",")) {
                    // Skip the empty tokens produced by consecutive or leading commas.
                    if (!token.isEmpty()) {
                        collector.collect(Tuple2.of(token, 1));
                    }
                }
            }
        })
                // Key by the word through a typed selector instead of the positional keyBy(0).
                .keyBy(t -> t.f0)
                .timeWindow(Time.seconds(5))
                .sum(1)
                .print();

        env.execute("StreamingJob");
    }
}

测试:

Scala实现

/**
  * Batch word count over a tab-separated text file.
  *
  * The input path may be given as the first program argument; when no argument
  * is supplied the sample path `D:/flink/data/hello.txt` is used.
  */
object CountWord {

  def main(args: Array[String]): Unit = {
    val path = args.headOption.getOrElse("D:/flink/data/hello.txt")

    val env = ExecutionEnvironment.getExecutionEnvironment

    val data = env.readTextFile(path)

    // `env` is a batch ExecutionEnvironment, so the implicit TypeInformation
    // instances come from the DataSet Scala API package, not the streaming one.
    import org.apache.flink.api.scala._

    data.flatMap(_.toLowerCase.split("\t"))
      .filter(_.nonEmpty) // drop empty tokens from blank lines or repeated tabs
      .map((_, 1))
      .groupBy(0)
      .sum(1)
      .print() // no env.execute() follows: printing a DataSet is what runs the job
  }

}

Scala 流处理实现

/**
  * Streaming word count: reads comma-separated words from a socket and prints
  * the count of each word for every 3-second window.
  */
object WordCountStreaming {

  def main(args: Array[String]): Unit = {

    val senv = StreamExecutionEnvironment.getExecutionEnvironment
    val text = senv.socketTextStream("192.168.21.128", 9999)

    // Supplies the implicit TypeInformation for every transformation below,
    // so no hand-written implicit for String is required.
    import org.apache.flink.streaming.api.scala._

    text.flatMap(_.split(","))
      .filter(_.nonEmpty) // drop empty tokens from consecutive or leading commas
      .map((_, 1))
      // Key by the word through a typed selector instead of the positional keyBy(0).
      .keyBy(_._1)
      .timeWindow(Time.seconds(3))
      .sum(1)
      .print()
    senv.execute("Streaming WordCount")
  }

}

 

 
posted on 2021-08-29 19:50  溪水静幽  阅读(212)  评论(0)    收藏  举报