学习笔记五

  1. Structured Streaming 核心概念(实时处理新世界)
    流式计算基础概念
    scala
    import org.apache.spark.sql.streaming.{OutputMode, Trigger, StreamingQuery}
    import org.apache.spark.sql.types._

// Streaming DataFrame vs. static DataFrame
// Static: reads the entire data set once, up front.
val staticDF = spark.read.json("data/static/*.json")

// Streaming: continuously picks up new files appearing in the directory.
val streamingDF = spark.readStream
.schema(defineSchema()) // streaming file sources require an explicit schema; defineSchema() is defined elsewhere — TODO confirm
.json("data/streaming/")
第一个流式程序:WordCount实时版
scala
object StreamingWordCount {

  /**
   * Minimal Structured Streaming word count: reads text lines from a local
   * socket, counts words, and prints the full result table to the console
   * every 5 seconds. Feed it with `nc -lk 9999`.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Streaming WordCount")
      .master("local[*]")
      .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
      .getOrCreate()

    import spark.implicits._

    // 1. Input stream: unbounded Dataset[String] of lines from the socket source.
    val socketLines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()
      .as[String]

    // 2. Transformation — identical API to batch processing: split into words
    //    and count occurrences (the implicit column name for a Dataset[String] is "value").
    val counts = socketLines.flatMap(_.split(" ")).groupBy("value").count()

    // 3. Sink: "complete" mode re-emits the whole result table on every trigger.
    val consoleQuery = counts.writeStream
      .outputMode("complete")  // complete, append, update
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds"))  // fire every 5 seconds
      .option("truncate", "false")
      .start()

    // 4. Block the driver until the query stops or fails.
    consoleQuery.awaitTermination()
  }
}

// 测试命令:nc -lk 9999
// 输入:hello world hello spark
2. 流式处理核心机制
事件时间 vs 处理时间
scala
// 准备带事件时间的流数据
// FIX: this import previously appeared *after* the first use of window()/current_timestamp()
// below, which fails in script evaluation order. It must precede the first use.
import org.apache.spark.sql.functions._

// Event payload carried by the stream.
// NOTE(review): eventTime is presumably java.sql.Timestamp via a file-top import — confirm.
case class Event(eventId: String, eventTime: Timestamp, value: Int, userId: String)

// Streaming file sources require an explicit schema (no inference by default).
val eventStream = spark.readStream
  .schema(StructType(Array(
    StructField("eventId", StringType),
    StructField("eventTime", TimestampType),
    StructField("value", IntegerType),
    StructField("userId", StringType)
  )))
  .json("data/events/")

// 1. Processing-time window (not recommended: results depend on when data
//    happens to arrive, not when the events actually occurred).
val processingTimeWindow = eventStream
  .groupBy(window(current_timestamp(), "10 minutes"))
  .agg(avg("value"))

// 2. Event-time window (recommended: the watermark lets late data be handled).
val eventTimeWindow = eventStream
  .withWatermark("eventTime", "10 minutes") // accept events arriving up to 10 minutes late
  .groupBy(
    window($"eventTime", "10 minutes", "5 minutes"), // 10-minute window sliding every 5 minutes
    $"userId"
  )
  .agg(
    sum("value") as "total_value",
    count("eventId") as "event_count",
    avg("value") as "avg_value"
  )
三种输出模式详解
scala
// 1. Complete Mode - 输出全部结果(适用于聚合查询)
// 1. Complete mode — every trigger re-emits the entire result table
//    (only valid for aggregation queries).
val completeQuery = wordCounts.writeStream
  .format("console")
  .outputMode("complete")
  .start()

// 2. Append mode — only rows added since the previous trigger
//    (suitable for filters / projections, which never update old rows).
val highValueEvents = eventStream.filter($"value" > 100)
val appendQuery = highValueEvents.writeStream
  .format("json")
  .outputMode("append")
  .option("path", "output/events/")
  .option("checkpointLocation", "/tmp/checkpoint/append")
  .start()

// 3. Update mode — only rows whose aggregate value changed since the previous trigger.
val updateQuery = eventStream
  .groupBy("userId")
  .agg(sum("value") as "total")
  .writeStream
  .format("console")
  .outputMode("update")
  .start()
3. 流式数据处理实战:实时用户行为分析
scala
object RealTimeUserAnalytics {

  /**
   * One user-action event from the Kafka topic.
   * FIX: moved out of main() to object level so Spark can reliably derive an
   * Encoder (and hence a schema) for it.
   * NOTE(review): timestamp is presumably java.sql.Timestamp via a file-top import — confirm.
   */
  case class UserAction(
    userId: String,
    actionType: String,
    pageId: String,
    productId: Option[String],
    timestamp: Timestamp,
    sessionId: String,
    value: Double
  )

  /** Reads user actions from Kafka and runs four concurrent streaming analyses. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Real-Time User Analytics")
      .master("local[*]")
      .config("spark.sql.streaming.schemaInference", "true")
      .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
      .getOrCreate()

    import spark.implicits._
    import org.apache.spark.sql.functions._
    // Local imports so this block is self-contained (harmless if also imported at file top).
    import org.apache.spark.sql.{DataFrame, Encoders, ForeachWriter, Row}

    // FIX: `connectionProperties` was referenced below but never defined.
    // TODO: replace the placeholder credentials with real ones (ideally from config).
    val connectionProperties = new java.util.Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "changeme")

    // 2. Kafka source (the usual choice in production).
    val kafkaStream = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "user-actions")
      .option("startingOffsets", "latest")
      .load()

    // 3. Parse the JSON payload.
    // FIX: `schemaOf[UserAction]` is not a Spark API — derive the schema from
    // the case class via Encoders.product.
    val actionSchema = Encoders.product[UserAction].schema
    val actionStream = kafkaStream
      .selectExpr("CAST(value AS STRING) as json")
      .select(from_json($"json", actionSchema).as("data"))
      .select("data.*")
      .withWatermark("timestamp", "10 minutes")  // tolerate up to 10 minutes of late events

    // 4. Analysis 1: active users per 5-minute window.
    println("=== 实时活跃用户统计 ===")
    val activeUsers = actionStream
      .groupBy(
        window($"timestamp", "5 minutes"),
        $"userId"
      )
      .agg(count("*") as "action_count")
      .filter($"action_count" > 5)  // activity threshold

    // FIX: was outputMode("complete"); complete mode ignores the watermark and keeps
    // all window state forever — "update" lets expired windows be evicted.
    activeUsers.writeStream
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("1 minute"))
      .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
        println(s"Batch $batchId 活跃用户:")
        batchDF.show()

        // Could equally be written to Redis/MySQL or another external store.
        batchDF.write
          .mode("overwrite")
          .json(s"output/active_users/batch_$batchId")
      }
      .start()

    // 5. Analysis 2: per-page PV/UV per 1-minute window.
    // FIX: exact countDistinct is not supported on streaming DataFrames —
    // use the HyperLogLog-based approx_count_distinct instead.
    val pageStats = actionStream
      .groupBy(
        window($"timestamp", "1 minute"),
        $"pageId"
      )
      .agg(
        count("*") as "pv",
        approx_count_distinct("userId") as "uv",
        avg("value") as "avg_value"
      )

    // NOTE(review): complete mode here also retains all window state; acceptable
    // for a console demo, but use update + watermark for long-running jobs.
    pageStats.writeStream
      .outputMode("complete")
      .format("console")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()

    // 6. Analysis 3: session-level aggregates (30-minute windows per session).
    val sessionAnalysis = actionStream
      .groupBy(
        $"sessionId",
        $"userId",
        window($"timestamp", "30 minutes")
      )
      .agg(
        min("timestamp") as "session_start",
        max("timestamp") as "session_end",
        collect_list("actionType") as "actions",
        sum("value") as "session_value",
        count("*") as "action_count"
      )
      .withColumn("session_duration",
        unix_timestamp($"session_end") - unix_timestamp($"session_start"))

    // NOTE(review): update mode + append JDBC writes can insert a session row more
    // than once as its aggregate evolves — deduplicate downstream or upsert.
    sessionAnalysis.writeStream
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("30 seconds"))
      .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
        batchDF.select(
          $"sessionId", $"userId",
          $"session_duration", $"session_value",
          $"action_count"
        ).write
          .mode("append")
          .jdbc("jdbc:mysql://localhost:3306/analytics",
                "sessions",
                connectionProperties)
      }
      .start()

    // 7. Analysis 4: anomaly detection / alerting.
    val alertStream = actionStream
      .groupBy(window($"timestamp", "1 minute"), $"userId")
      .agg(sum("value") as "total_value")
      .filter($"total_value" > 10000)  // abnormal-activity threshold

    // FIX: in Scala, writeStream.foreach takes a ForeachWriter (the bare-lambda
    // form used before is PySpark-only and does not compile here).
    alertStream.writeStream
      .outputMode("update")
      .foreach(new ForeachWriter[Row] {
        override def open(partitionId: Long, epochId: Long): Boolean = true
        override def process(row: Row): Unit = {
          // Output columns: window(0), userId(1), total_value(2).
          val userId = row.getString(1)
          val totalValue = row.getDouble(2)
          sendAlert(userId, totalValue)  // raise the alert
        }
        override def close(errorOrNull: Throwable): Unit = ()
      })
      .start()

    // 8. One-shot snapshot of query state (runs once at startup; poll for live monitoring).
    spark.streams.active.foreach { query =>
      println(s"Query: ${query.name}")
      println(s"Status: ${query.status}")
      println(s"Progress: ${query.lastProgress}")
    }

    // Block until any of the concurrent queries terminates.
    spark.streams.awaitAnyTermination()
  }

  /** Alert hook — currently just logs; wire up email/SMS etc. in production. */
  def sendAlert(userId: String, value: Double): Unit = {
    println(s"⚠️ ALERT: User $userId abnormal activity: $value")
  }
}

posted @ 2026-03-09 20:07  Lomook  阅读(2)  评论(0)    收藏  举报