Study Notes 5
1. Structured Streaming Core Concepts (A New World of Real-Time Processing)
Streaming Computation Basics
scala
import org.apache.spark.sql.streaming.{OutputMode, Trigger, StreamingQuery}
import org.apache.spark.sql.types._

// Streaming DataFrame vs. static DataFrame
// Static: read all of the data once
val staticDF = spark.read.json("data/static/*.json")

// Streaming: continuously pick up new data as it arrives
val streamingDF = spark.readStream
  .schema(defineSchema()) // file-based streaming sources require an explicit schema
  .json("data/streaming/")
First Streaming Program: Real-Time WordCount
scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger

object StreamingWordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Streaming WordCount")
      .master("local[*]")
      .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
      .getOrCreate()
    import spark.implicits._

    // 1. Create the input stream (read lines from a socket)
    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()
      .as[String]

    // 2. Transform the data (the exact same API as batch processing!)
    val wordCounts = lines
      .flatMap(_.split(" "))
      .groupBy("value")
      .count()

    // 3. Configure the output sink
    val query = wordCounts.writeStream
      .outputMode("complete") // complete, append, update
      .format("console")
      .trigger(Trigger.ProcessingTime("5 seconds")) // fire every 5 seconds
      .option("truncate", "false")
      .start()

    // 4. Block until the query terminates
    query.awaitTermination()
  }
}
// To test, start a socket server: nc -lk 9999
// Then type: hello world hello spark
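With complete mode, the console sink reprints the full counts table on every trigger; for the sample input above, the first batch would look roughly like this (row order may vary):
-------------------------------------------
Batch: 0
-------------------------------------------
+-----+-----+
|value|count|
+-----+-----+
|hello|    2|
|world|    1|
|spark|    1|
+-----+-----+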
2. Core Mechanisms of Stream Processing
Event Time vs. Processing Time
scala
// Assumes spark, spark.implicits._ and the type imports from the first snippet are in scope
import java.sql.Timestamp
import org.apache.spark.sql.functions._

// A stream whose records carry an event time
case class Event(eventId: String, eventTime: Timestamp, value: Int, userId: String)

val eventStream = spark.readStream
  .schema(StructType(Array(
    StructField("eventId", StringType),
    StructField("eventTime", TimestampType),
    StructField("value", IntegerType),
    StructField("userId", StringType)
  )))
  .json("data/events/")

// 1. Processing-time window (not recommended: results depend on arrival time, not on when events happened)
val processingTimeWindow = eventStream
  .groupBy(window(current_timestamp(), "10 minutes"))
  .agg(avg("value"))

// 2. Event-time window (recommended: handles late data via the watermark)
val eventTimeWindow = eventStream
  .withWatermark("eventTime", "10 minutes") // tolerate up to 10 minutes of lateness
  .groupBy(
    window($"eventTime", "10 minutes", "5 minutes"), // 10-minute window sliding every 5 minutes
    $"userId"
  )
  .agg(
    sum("value") as "total_value",
    count("eventId") as "event_count",
    avg("value") as "avg_value"
  )
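The window column produced above is a struct with start and end fields. Because of the watermark, this aggregation also supports append mode, which emits each window exactly once, after the watermark passes the window's end. A minimal sketch (the output paths are illustrative):
scala
val windowedQuery = eventTimeWindow
  .select($"window.start" as "window_start", // unpack the window struct
    $"window.end" as "window_end",
    $"userId", $"total_value", $"event_count", $"avg_value")
  .writeStream
  .outputMode("append") // each window is emitted once it can no longer change
  .format("json")
  .option("path", "output/windows/") // hypothetical output path
  .option("checkpointLocation", "/tmp/checkpoint/windows") // hypothetical path
  .start()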
The Three Output Modes in Detail
scala
// 1. Complete mode - output the entire result table (for aggregation queries)
val completeQuery = wordCounts.writeStream
  .outputMode("complete")
  .format("console")
  .start()

// 2. Append mode - output only newly appended rows (for filter/projection queries)
val appendQuery = eventStream
  .filter($"value" > 100)
  .writeStream
  .outputMode("append")
  .format("json")
  .option("path", "output/events/")
  .option("checkpointLocation", "/tmp/checkpoint/append")
  .start()

// 3. Update mode - output only rows that changed since the last trigger (for partial aggregations)
val updateQuery = eventStream
  .groupBy("userId")
  .agg(sum("value") as "total")
  .writeStream
  .outputMode("update")
  .format("console")
  .start()
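Note that the output mode has to match the query: complete is only supported for queries with aggregations; update falls back to append semantics when there is no aggregation; and append on an aggregation additionally requires a watermark (as in the event-time example above). Spark rejects an unsupported combination with an AnalysisException when the query starts.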
3. Stream Processing in Practice: Real-Time User Behavior Analytics
scala
import java.sql.Timestamp
import org.apache.spark.sql.{DataFrame, Encoders, ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.streaming.Trigger

object RealTimeUserAnalytics {
  // 1. Data model (kept at the top level so Spark can derive an encoder/schema for it)
  case class UserAction(
    userId: String,
    actionType: String,
    pageId: String,
    productId: Option[String],
    timestamp: Timestamp,
    sessionId: String,
    value: Double
  )

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Real-Time User Analytics")
      .master("local[*]")
      .config("spark.sql.streaming.schemaInference", "true")
      .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoint")
      .getOrCreate()
    import spark.implicits._
    import org.apache.spark.sql.functions._
    // 2. Read from Kafka (the usual production source). The Kafka source exposes
    //    key, value, topic, partition, offset, timestamp and timestampType columns.
    val kafkaStream = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "user-actions")
      .option("startingOffsets", "latest")
      .load()

    // 3. Parse the JSON payload, deriving the schema from the case class
    val userActionSchema = Encoders.product[UserAction].schema
    val actionStream = kafkaStream
      .selectExpr("CAST(value AS STRING) AS json")
      .select(from_json($"json", userActionSchema).as("data"))
      .select("data.*")
      .withWatermark("timestamp", "10 minutes") // tolerate late data up to 10 minutes
    // 4. Real-time analysis 1: active-user statistics
    println("=== Real-time active user statistics ===")
    val activeUsers = actionStream
      .groupBy(
        window($"timestamp", "5 minutes"),
        $"userId"
      )
      .agg(count("*") as "action_count")
      .filter($"action_count" > 5) // threshold for counting a user as active

    activeUsers.writeStream
      .queryName("active-users") // name the query so the monitoring below can identify it
      .outputMode("complete")
      .trigger(Trigger.ProcessingTime("1 minute"))
      .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
        println(s"Batch $batchId active users:")
        batchDF.show()
        // could also write to external storage such as Redis/MySQL
        batchDF.write
          .mode("overwrite")
          .json(s"output/active_users/batch_$batchId")
      }
      .start()
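foreachBatch gives at-least-once semantics: after a failure a batch may be reprocessed, which is exactly why the batchId is passed in — the sink can use it to deduplicate or, as above, to overwrite the same per-batch output path on retry.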
    // 5. Real-time analysis 2: per-page PV/UV
    val pageStats = actionStream
      .groupBy(
        window($"timestamp", "1 minute"),
        $"pageId"
      )
      .agg(
        count("*") as "pv",
        // exact countDistinct is not supported on streaming aggregations,
        // so approximate the UV with HyperLogLog
        approx_count_distinct("userId") as "uv",
        avg("value") as "avg_value"
      )

    pageStats.writeStream
      .queryName("page-stats")
      .outputMode("complete")
      .format("console")
      .option("truncate", "false")
      .trigger(Trigger.ProcessingTime("10 seconds"))
      .start()
    // 6. Real-time analysis 3: user session analysis
    //    (a fixed 30-minute window per sessionId, as an approximation of a session)
    val sessionAnalysis = actionStream
      .groupBy(
        $"sessionId",
        $"userId",
        window($"timestamp", "30 minutes")
      )
      .agg(
        min("timestamp") as "session_start",
        max("timestamp") as "session_end",
        collect_list("actionType") as "actions",
        sum("value") as "session_value",
        count("*") as "action_count"
      )
      .withColumn("session_duration",
        unix_timestamp($"session_end") - unix_timestamp($"session_start"))

    // JDBC connection settings (user/password/driver are placeholder assumptions)
    val connectionProperties = new java.util.Properties()
    connectionProperties.put("user", "analytics")
    connectionProperties.put("password", "secret")
    connectionProperties.put("driver", "com.mysql.cj.jdbc.Driver")

    sessionAnalysis.writeStream
      .queryName("session-analysis")
      .outputMode("update")
      .trigger(Trigger.ProcessingTime("30 seconds"))
      .foreachBatch { (batchDF: DataFrame, batchId: Long) =>
        // persist the per-session results to the analytics database
        batchDF.select(
          $"sessionId", $"userId",
          $"session_duration", $"session_value",
          $"action_count"
        ).write
          .mode("append")
          .jdbc("jdbc:mysql://localhost:3306/analytics",
            "sessions",
            connectionProperties)
      }
      .start()
    // 7. Real-time monitoring and alerting
    val alertStream = actionStream
      .groupBy(window($"timestamp", "1 minute"), $"userId")
      .agg(sum("value") as "total_value")
      .filter($"total_value" > 10000) // flag anomalous activity

    alertStream.writeStream
      .queryName("alerts")
      .outputMode("update")
      .foreach(new ForeachWriter[Row] { // writeStream.foreach takes a ForeachWriter, not a plain lambda
        override def open(partitionId: Long, epochId: Long): Boolean = true
        override def process(row: Row): Unit = {
          val userId = row.getString(1) // columns: window, userId, total_value
          val totalValue = row.getDouble(2)
          sendAlert(userId, totalValue) // fire the alert
        }
        override def close(errorOrNull: Throwable): Unit = ()
      })
      .start()
    // 8. Query state and monitoring
    spark.streams.active.foreach { query =>
      println(s"Query: ${query.name}")
      println(s"Status: ${query.status}")
      println(s"Progress: ${query.lastProgress}")
    }

    spark.streams.awaitAnyTermination()
  }

  def sendAlert(userId: String, value: Double): Unit = {
    println(s"⚠️ ALERT: User $userId abnormal activity: $value")
    // in practice, send an email/SMS/etc. here
  }
}
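Besides polling spark.streams.active, a StreamingQueryListener can push progress events as they happen; a minimal sketch:
scala
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.sql.streaming.StreamingQueryListener._

// Register once per SparkSession; the callbacks fire asynchronously for every query.
spark.streams.addListener(new StreamingQueryListener {
  override def onQueryStarted(event: QueryStartedEvent): Unit =
    println(s"Query started: ${event.name}")
  override def onQueryProgress(event: QueryProgressEvent): Unit =
    println(s"Processed ${event.progress.numInputRows} rows " +
      s"at ${event.progress.inputRowsPerSecond} rows/s")
  override def onQueryTerminated(event: QueryTerminatedEvent): Unit =
    println(s"Query terminated: ${event.id}")
})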
