2025/1/14
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.graphx._
// Initialize the Spark environment: one shared SparkContext backs the batch,
// streaming and SQL layers. Streaming needs at least two local threads
// (one for the receiver, one for processing), hence local[2].
val conf = new SparkConf().setAppName("Summary Example").setMaster("local[2]")
val spark = SparkSession.builder.config(conf).getOrCreate()
val sc = spark.sparkContext
val ssc = new StreamingContext(sc, Seconds(5))
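// Optional smoke test (an assumption, not in the original listing): a one-row
// DataFrame confirms the shared SQL layer is up before the stream starts.
import spark.implicits._
Seq(("spark", 1)).toDF("word", "count").show()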
// Real-time stream processing: word counts per 5-second batch from a socket
val lines = ssc.socketTextStream("localhost", 9999)
val words = lines.flatMap(_.split(" "))
val wordCounts = words.map(word => (word, 1)).reduceByKey(_ + _)
wordCounts.foreachRDD { rdd =>
  if (!rdd.isEmpty()) {
    val df = spark.createDataFrame(rdd).toDF("word", "count")
    // LogisticRegression requires a numeric "label" column and a vector
    // "features" column; derive toy ones from the counts so the fit actually
    // runs (the parity label is purely illustrative, not a training signal)
    val assembler = new VectorAssembler().setInputCols(Array("count")).setOutputCol("features")
    val training = assembler.transform(df.withColumn("label", (col("count") % 2).cast("double")))
    val model = new LogisticRegression().fit(training)
    model.transform(training).show()
  }
}
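// Note (assumption about the test setup, not in the original): feed the socket
// from another terminal before starting the context, e.g. with `nc -lk 9999`,
// then type words; each 5-second batch is counted and scored above.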
// Graph computation: a two-vertex friendship graph and its PageRank
val vertices: RDD[(VertexId, String)] = sc.parallelize(Array((1L, "Alice"), (2L, "Bob")))
val edges: RDD[Edge[String]] = sc.parallelize(Array(Edge(1L, 2L, "Friend")))
val graph = Graph(vertices, edges)
val ranks = graph.pageRank(0.0001).vertices
ranks.collect().foreach { case (id, rank) => println(s"$id has rank: $rank") }
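// Optional sketch (illustrative, not in the original): the same graph can be
// inspected through its triplets, printing each edge with both endpoint names.
graph.triplets
  .map(t => s"${t.srcAttr} --${t.attr}--> ${t.dstAttr}")
  .collect()
  .foreach(println)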
// Start the streaming computation and block until it is terminated
ssc.start()
ssc.awaitTermination()