Spark Streaming之dataset实例

  Spark Streaming是核心Spark API的扩展,可实现实时数据流的可扩展,高吞吐量,容错流处理。

  bin/spark-submit --class Streaming /home/wx/Stream.jar
  hadoop fs -put /home/wx/123.txt /user/wx/

文本123.txt

NOTICE:07-26 logId[0072]
NOTICE:07-26 logId[0073]
NOTICE:07-26 logId[0074]
NOTICE:07-26 logId[0075]
NOTICE:07-26 logId[0076]

 

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._
import org.apache.spark.sql.SparkSession

object Streaming {
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[2]").setAppName("RegexpExtract")
    val ssc = new StreamingContext(conf, Seconds(1))

    println("hello world")

    val lines = ssc.textFileStream("hdfs://name-ha/user/wx/")

    val ds = lines.flatMap(_.split("\n"))

    ds.print()

    ds.foreachRDD { rdd =>

      // Get the singleton instance of SparkSession
      val spark = SparkSession.builder.config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._

      // Convert RDD[String] to DataFrame
      val wordsDataFrame = rdd.toDF("str_col")

      // Create a temporary view
      wordsDataFrame.createOrReplaceTempView("df")

      // Do word count on DataFrame using SQL and print it
      val wordCountsDataFrame =
        spark.sql(raw"""
          select str_col,
          regexp_extract(str_col,"NOTICE:\\d{2}",0) notice,
          regexp_extract(str_col,"logId\\[(.*?)\\]",0) logId 
          from df""")
      wordCountsDataFrame.show(false)
    }

    ssc.start() // Start the computation
    ssc.awaitTermination() // Wait for the computation to terminate
  }
}

 

执行结果

hello world
-------------------------------------------
Time: 1501501752000 ms
-------------------------------------------

NOTICE:07-26 logId[0072]
NOTICE:07-26 logId[0073]
NOTICE:07-26 logId[0074]
NOTICE:07-26 logId[0075]
NOTICE:07-26 logId[0076]

+------------------------+---------+-----------+
|str_col                 |notice   |logId      |
+------------------------+---------+-----------+
|NOTICE:07-26 logId[0072]|NOTICE:07|logId[0072]|
|NOTICE:07-26 logId[0073]|NOTICE:07|logId[0073]|
|NOTICE:07-26 logId[0074]|NOTICE:07|logId[0074]|
|NOTICE:07-26 logId[0075]|NOTICE:07|logId[0075]|
|NOTICE:07-26 logId[0076]|NOTICE:07|logId[0076]|
+------------------------+---------+-----------+

-------------------------------------------
Time: 1501501770000 ms
-------------------------------------------

 

posted @ 2017-07-31 19:59  智能先行者  阅读(1313)  评论(0编辑  收藏  举报