学习笔记六

  1. MLlib机器学习基础(开启AI之旅)
    机器学习Pipeline概念
    scala
    import org.apache.spark.ml.{Pipeline, PipelineModel}
    import org.apache.spark.ml.feature._
    import org.apache.spark.ml.classification._
    import org.apache.spark.ml.evaluation._
    import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
    import org.apache.spark.sql.types._

// MLlib核心概念:
// - Transformer:转换器(特征工程)
// - Estimator:估计器(算法模型)
// - Pipeline:工作流(整合所有步骤)
// - Param:参数(模型配置)
第一个机器学习示例:用户分类
scala
object UserClassification {

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.ml.attribute.AttributeGroup

  // Defined at object level (not inside main): Spark cannot derive an
  // Encoder/TypeTag for a case class that is local to a method body,
  // so toDF() on such a Seq fails to compile in a regular project.
  final case class UserData(
    userId: String,
    age: Int,
    income: Double,
    education: String,
    city: String,
    lastLoginDays: Int,   // days since the last login
    purchaseCount: Int,
    avgOrderValue: Double,
    label: Int            // 0: regular user, 1: VIP user
  )

  /** Trains a random-forest pipeline that separates VIP from regular users. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("User Classification")
      .master("local[*]")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()

    import spark.implicits._

    // 1. Toy data set.
    val data = Seq(
      UserData("u1", 25, 8000.0, "本科", "北京", 2, 15, 300.0, 0),
      UserData("u2", 35, 25000.0, "硕士", "上海", 1, 45, 1200.0, 1),
      UserData("u3", 28, 12000.0, "本科", "广州", 5, 20, 450.0, 0),
      UserData("u4", 42, 35000.0, "博士", "北京", 3, 60, 2000.0, 1),
      UserData("u5", 31, 18000.0, "硕士", "深圳", 10, 25, 800.0, 0),
      UserData("u6", 45, 42000.0, "博士", "上海", 1, 80, 3000.0, 1),
      UserData("u7", 22, 5000.0, "大专", "成都", 15, 5, 150.0, 0),
      UserData("u8", 38, 28000.0, "硕士", "杭州", 2, 50, 1500.0, 1)
    ).toDF()

    // 2. Feature engineering.
    // 2.1 Bucketize age. setSplits requires Array[Double] (the original
    // Array[Int] does not compile); infinite outer bounds keep transform
    // from failing on ages outside the expected range.
    val ageBucketizer = new Bucketizer()
      .setInputCol("age")
      .setOutputCol("age_category")
      .setSplits(Array(Double.NegativeInfinity, 25.0, 35.0, 45.0, Double.PositiveInfinity))

    // 2.2 Standardize income. StandardScaler only accepts Vector columns,
    // so the scalar income is wrapped into a 1-element vector first.
    val incomeAssembler = new VectorAssembler()
      .setInputCols(Array("income"))
      .setOutputCol("income_vec")

    val incomeScaler = new StandardScaler()
      .setInputCol("income_vec")
      .setOutputCol("scaled_income")
      .setWithStd(true)
      .setWithMean(true)

    // 2.3 Index the categorical education level; "keep" puts unseen labels
    // into an extra bucket instead of throwing at transform time.
    val educationIndexer = new StringIndexer()
      .setInputCol("education")
      .setOutputCol("education_index")
      .setHandleInvalid("keep")

    // 2.4 One-hot encode the city.
    val cityIndexer = new StringIndexer()
      .setInputCol("city")
      .setOutputCol("city_index")

    val cityEncoder = new OneHotEncoder()
      .setInputCol("city_index")
      .setOutputCol("city_vector")

    // 2.5 Assemble all features into one vector column.
    val assembler = new VectorAssembler()
      .setInputCols(Array(
        "age_category", "scaled_income", "education_index",
        "city_vector", "lastLoginDays", "purchaseCount", "avgOrderValue"
      ))
      .setOutputCol("features")

    // 3. Random forest classifier.
    val rf = new RandomForestClassifier()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setNumTrees(100)
      .setMaxDepth(10)
      .setSeed(42)

    // 4. Pipeline: feature stages first, the estimator last.
    val pipeline = new Pipeline()
      .setStages(Array(
        ageBucketizer,
        incomeAssembler,
        incomeScaler,
        educationIndexer,
        cityIndexer,
        cityEncoder,
        assembler,
        rf
      ))

    // 5. Train/test split. NOTE(review): with only 8 rows the test split may
    // come out empty for some seeds.
    val Array(trainingData, testData) = data.randomSplit(Array(0.8, 0.2), seed = 42)

    // 6. Fit the whole pipeline on the training rows.
    println("开始训练模型...")
    val model = pipeline.fit(trainingData)

    // 7. Score the held-out rows.
    val predictions = model.transform(testData)

    println("\n=== 预测结果 ===")
    predictions.select("userId", "label", "prediction", "probability")
      .show(false)

    // 8. Evaluate accuracy.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")

    val accuracy = evaluator.evaluate(predictions)
    // f-interpolator: the original s"..." left the %.2f directive as literal text.
    println(f"\n模型准确率: ${accuracy * 100}%.2f%%")

    // 9. Feature importances. The one-hot city_vector expands into several
    // slots, so names are read from the assembled column's ML attribute
    // metadata instead of a hand-written 7-element array (which would
    // misalign every feature after "city").
    val rfModel = model.stages.last.asInstanceOf[RandomForestClassificationModel]
    println("\n=== 特征重要性 ===")
    val attrGroup = AttributeGroup.fromStructField(predictions.schema("features"))
    val featureNames: Array[String] = attrGroup.attributes
      .map(_.map(a => a.name.getOrElse(a.index.fold("?")(_.toString))))
      .getOrElse(Array.tabulate(rfModel.featureImportances.size)(_.toString))
    rfModel.featureImportances.toArray.zip(featureNames)
      .sortBy(-_._1)
      .foreach { case (importance, name) =>
        println(f"$name: ${importance * 100}%.2f%%")
      }

    spark.stop()
  }
}
2. 常用机器学习算法实战
2.1 协同过滤推荐系统
scala
object MovieRecommendation {

  import org.apache.spark.sql.SparkSession

  /** Collaborative-filtering movie recommendation with ALS. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Movie Recommendation")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // 1. Ratings: userId, movieId, rating (timestamp column dropped).
    val ratingsDF = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/ratings.csv")
      .select("userId", "movieId", "rating")

    println("评分数据统计:")
    ratingsDF.describe("rating").show()

    // 2. ALS collaborative filtering.
    import org.apache.spark.ml.recommendation.ALS

    val als = new ALS()
      .setMaxIter(10)
      .setRank(10)                   // number of latent factors
      .setRegParam(0.1)              // regularization strength
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
      .setColdStartStrategy("drop")  // drop NaN predictions for unseen users/items

    // 3. Train/test split.
    val Array(training, test) = ratingsDF.randomSplit(Array(0.8, 0.2))

    // 4. Fit.
    val model = als.fit(training)

    // 5. Evaluate with RMSE. Package fixed: the evaluator lives in
    // org.apache.spark.ml.evaluation, not org.apache.ml.evaluation.
    val predictions = model.transform(test)

    import org.apache.spark.ml.evaluation.RegressionEvaluator
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")

    val rmse = evaluator.evaluate(predictions)
    println(s"模型RMSE: $rmse")

    // 6. Top-5 movies for every user.
    val userRecs = model.recommendForAllUsers(5)
    println("\n=== 用户推荐结果 ===")
    userRecs.show(5, false)

    // 7. Top-5 users for every movie.
    val movieRecs = model.recommendForAllItems(5)
    println("\n=== 电影推荐用户 ===")
    movieRecs.show(5, false)

    // 8. Recommendations for one specific user. The original called
    // recommendForAllItems (movie -> users) and filtered on a userId column
    // that frame does not have; recommendForUserSubset computes the top-10
    // movies for just the requested user.
    val targetUser = 123
    val userRecommendations = model.recommendForUserSubset(
      Seq(targetUser).toDF("userId"), 10)

    println(s"\n用户 $targetUser 的推荐电影:")
    userRecommendations.show(false)

    spark.stop()
  }
}
2.2 K-Means聚类分析
scala
object CustomerSegmentation {

  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.functions.{avg, count}
  import org.apache.spark.ml.clustering.KMeans
  import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}

  // Object-level definition: Spark cannot derive an Encoder for a case
  // class declared inside a method body, so toDF() would not compile.
  final case class CustomerRFM(
    customerId: String,
    recency: Int,       // days since last purchase
    frequency: Int,     // purchase count
    monetary: Double,   // total spend
    avgOrderValue: Double,
    categoryCount: Int  // distinct categories bought
  )

  /** RFM-based customer segmentation with K-Means. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Customer Segmentation")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // 1. RFM-style customer features.
    val customerData = Seq(
      CustomerRFM("c001", 5, 20, 5000.0, 250.0, 8),
      CustomerRFM("c002", 30, 5, 800.0, 160.0, 3),
      CustomerRFM("c003", 2, 50, 15000.0, 300.0, 15),
      CustomerRFM("c004", 45, 3, 450.0, 150.0, 2),
      CustomerRFM("c005", 8, 15, 3500.0, 233.0, 6),
      CustomerRFM("c006", 60, 2, 200.0, 100.0, 1),
      CustomerRFM("c007", 3, 35, 12000.0, 342.0, 12),
      CustomerRFM("c008", 20, 8, 1800.0, 225.0, 4)
    ).toDF()

    // 2. Assemble the raw numeric features into one vector.
    val assembler = new VectorAssembler()
      .setInputCols(Array("recency", "frequency", "monetary", "avgOrderValue", "categoryCount"))
      .setOutputCol("features")

    val featureDF = assembler.transform(customerData)

    // 3. Standardize — essential for K-Means, whose Euclidean distances
    // would otherwise be dominated by the large monetary values.
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled_features")
      .setWithStd(true)
      .setWithMean(true)

    val scalerModel = scaler.fit(featureDF)
    val scaledDF = scalerModel.transform(featureDF)

    // 4. Elbow method to pick K. KMeansModel.computeCost was deprecated in
    // Spark 2.4 and removed in 3.0; the within-cluster sum of squared
    // distances is available as summary.trainingCost instead.
    println("=== 肘部法则分析 ===")
    val ks = Array(2, 3, 4, 5, 6)
    val costs = ks.map { k =>
      val candidate = new KMeans()
        .setK(k)
        .setSeed(42)
        .setFeaturesCol("scaled_features")
        .setPredictionCol("prediction")

      val fitted = candidate.fit(scaledDF)
      val cost = fitted.summary.trainingCost
      println(s"K=$k, 误差平方和: $cost")
      (k, cost)
    }

    // 5. Final clustering with K = 3.
    val kmeans = new KMeans()
      .setK(3)
      .setSeed(42)
      .setFeaturesCol("scaled_features")
      .setPredictionCol("segment")
      .setMaxIter(20)

    val model = kmeans.fit(scaledDF)

    // 6. Assign each customer to a segment.
    val segmented = model.transform(scaledDF)
      .select("customerId", "recency", "frequency", "monetary", "segment")

    println("\n=== 客户分群结果 ===")
    segmented.show()

    // 7. Per-segment averages (avg/count need the functions import above,
    // which the original omitted entirely).
    println("\n=== 各群体特征分析 ===")
    segmented.groupBy("segment")
      .agg(
        avg("recency").alias("avg_recency"),
        avg("frequency").alias("avg_frequency"),
        avg("monetary").alias("avg_monetary"),
        count("customerId").alias("customer_count")
      )
      .orderBy("segment")
      .show()

    // 8. Cluster centers — note these are in *scaled* feature space.
    println("\n=== 聚类中心 ===")
    model.clusterCenters.foreach { center =>
      println(s"中心点: ${center.toArray.mkString(", ")}")
    }

    spark.stop()
  }
}
3. GraphX图计算入门
图数据结构与操作
scala
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

object SocialNetworkAnalysis {

  import org.apache.spark.sql.SparkSession

  /** Small social-graph analysis with GraphX: degrees, PageRank,
    * connected components and triangle counts. */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Social Network Analysis")
      .master("local[*]")
      .getOrCreate()

    val sc = spark.sparkContext

    // 1. Vertices: (vertexId, (name, age, city)).
    val vertices: RDD[(VertexId, (String, Int, String))] = sc.parallelize(Array(
      (1L, ("Alice", 28, "北京")),
      (2L, ("Bob", 32, "上海")),
      (3L, ("Charlie", 35, "北京")),
      (4L, ("David", 25, "广州")),
      (5L, ("Eve", 30, "深圳")),
      (6L, ("Frank", 40, "北京"))
    ))

    // 2. Edges: Edge(srcId, dstId, relationship label).
    val edges: RDD[Edge[String]] = sc.parallelize(Array(
      Edge(1L, 2L, "朋友"),
      Edge(1L, 3L, "同事"),
      Edge(2L, 3L, "朋友"),
      Edge(3L, 4L, "家人"),
      Edge(4L, 5L, "朋友"),
      Edge(5L, 6L, "同事"),
      Edge(2L, 5L, "朋友"),
      Edge(1L, 6L, "朋友")
    ))

    // 3. Build the property graph.
    val graph = Graph(vertices, edges)

    println("=== 图基本信息 ===")
    println(s"顶点数: ${graph.vertices.count()}")
    println(s"边数: ${graph.edges.count()}")

    // 4. Dump vertices and edges.
    println("\n=== 所有顶点 ===")
    graph.vertices.collect().foreach { case (id, (name, age, city)) =>
      println(s"ID: $id, 姓名: $name, 年龄: $age, 城市: $city")
    }

    println("\n=== 所有边 ===")
    graph.edges.collect().foreach { edge =>
      println(s"${edge.srcId} -> ${edge.dstId} : ${edge.attr}")
    }

    // 5. Degree per person. Join degrees with the vertex RDD once instead
    // of running a filter().first() job per row inside the foreach — the
    // original launched one Spark job for every vertex.
    println("\n=== 度数统计 ===")
    val namedDegrees = graph.degrees.join(vertices).map {
      case (_, (degree, (name, _, _))) => (name, degree)
    }
    namedDegrees.collect().sortBy(-_._2).foreach { case (name, degree) =>
      println(s"$name: $degree 个连接")
    }

    // 6. PageRank — rank people by structural importance.
    println("\n=== PageRank重要性排名 ===")
    val ranks = graph.pageRank(0.0001).vertices
    val rankByName = vertices.join(ranks).map {
      case (id, ((name, _, _), rank)) => (name, rank)
    }
    rankByName.collect().sortBy(-_._2).foreach { case (name, rank) =>
      println(s"$name: $rank")
    }

    // 7. Connected components — discover social circles.
    println("\n=== 连通组件分析 ===")
    val connectedComponents = graph.connectedComponents().vertices
    val componentGroups = connectedComponents.join(vertices).map {
      case (vertexId, (componentId, (name, _, _))) => (componentId, name)
    }.groupByKey()

    componentGroups.collect().foreach { case (componentId, members) =>
      println(s"组件 $componentId: ${members.mkString(", ")}")
    }

    // 8. Triangle counting — measures local clustering. (The edges above
    // already satisfy the canonical srcId < dstId orientation that
    // triangleCount expects.)
    println("\n=== 三角形计数 ===")
    val triCounts = graph.triangleCount().vertices
    triCounts.join(vertices).map {
      case (id, (count, (name, _, _))) => (name, count)
    }.collect().sortBy(-_._2).foreach { case (name, count) =>
      println(s"$name 参与 $count 个三角形")
    }

    spark.stop()
  }
}

posted @ 2026-03-09 20:09  Lomook  阅读(2)  评论(0)    收藏  举报