学习笔记六
- MLlib机器学习基础(开启AI之旅)
机器学习Pipeline概念
scala
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.evaluation._
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
// MLlib核心概念:
// - Transformer:转换器(特征工程)
// - Estimator:估计器(算法模型)
// - Pipeline:工作流(整合所有步骤)
// - Param:参数(模型配置)
第一个机器学习示例:用户分类
scala
object UserClassification {

  // NOTE: case classes used with Spark encoders must be declared outside the
  // method body — `toDF()` needs a TypeTag, which the compiler cannot provide
  // for a class defined inside `main`.
  final case class UserData(
    userId: String,
    age: Int,
    income: Double,
    education: String,
    city: String,
    lastLoginDays: Int,
    purchaseCount: Int,
    avgOrderValue: Double,
    label: Int // 0: regular user, 1: VIP user
  )

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("User Classification")
      .master("local[*]")
      .config("spark.sql.adaptive.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    // 1. Prepare training data
    val data = Seq(
      UserData("u1", 25, 8000.0, "本科", "北京", 2, 15, 300.0, 0),
      UserData("u2", 35, 25000.0, "硕士", "上海", 1, 45, 1200.0, 1),
      UserData("u3", 28, 12000.0, "本科", "广州", 5, 20, 450.0, 0),
      UserData("u4", 42, 35000.0, "博士", "北京", 3, 60, 2000.0, 1),
      UserData("u5", 31, 18000.0, "硕士", "深圳", 10, 25, 800.0, 0),
      UserData("u6", 45, 42000.0, "博士", "上海", 1, 80, 3000.0, 1),
      UserData("u7", 22, 5000.0, "大专", "成都", 15, 5, 150.0, 0),
      UserData("u8", 38, 28000.0, "硕士", "杭州", 2, 50, 1500.0, 1)
    ).toDF()

    // 2. Feature engineering
    // 2.1 Bucketize age. Fixed: setSplits takes Array[Double]; the original
    // Array(0, 25, ...) inferred Array[Int] and did not compile.
    val ageBucketizer = new Bucketizer()
      .setInputCol("age")
      .setOutputCol("age_category")
      .setSplits(Array(0.0, 25.0, 35.0, 45.0, 100.0))

    // 2.2 Standardize income. Fixed: StandardScaler only accepts a Vector
    // input column, so the raw Double must be wrapped in a one-element
    // vector first.
    val incomeVectorizer = new VectorAssembler()
      .setInputCols(Array("income"))
      .setOutputCol("income_vec")
    val incomeScaler = new StandardScaler()
      .setInputCol("income_vec")
      .setOutputCol("scaled_income")
      .setWithStd(true)
      .setWithMean(true)

    // 2.3 Index the categorical education feature
    val educationIndexer = new StringIndexer()
      .setInputCol("education")
      .setOutputCol("education_index")
      .setHandleInvalid("keep")

    // 2.4 One-hot encode the city
    val cityIndexer = new StringIndexer()
      .setInputCol("city")
      .setOutputCol("city_index")
    val cityEncoder = new OneHotEncoder()
      .setInputCol("city_index")
      .setOutputCol("city_vector")

    // 2.5 Assemble all features into a single vector
    val assembler = new VectorAssembler()
      .setInputCols(Array(
        "age_category", "scaled_income", "education_index",
        "city_vector", "lastLoginDays", "purchaseCount", "avgOrderValue"
      ))
      .setOutputCol("features")

    // 3. Choose an algorithm (random forest)
    val rf = new RandomForestClassifier()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setNumTrees(100)
      .setMaxDepth(10)
      .setSeed(42)

    // 4. Build the pipeline (incomeVectorizer must run before incomeScaler)
    val pipeline = new Pipeline()
      .setStages(Array(
        ageBucketizer,
        incomeVectorizer,
        incomeScaler,
        educationIndexer,
        cityIndexer,
        cityEncoder,
        assembler,
        rf
      ))

    // 5. Train/test split
    val Array(trainingData, testData) = data.randomSplit(Array(0.8, 0.2), seed = 42)

    // 6. Train the model
    println("开始训练模型...")
    val model = pipeline.fit(trainingData)

    // 7. Predict on held-out data
    val predictions = model.transform(testData)
    println("\n=== 预测结果 ===")
    predictions.select("userId", "label", "prediction", "probability")
      .show(false)

    // 8. Evaluate. Fixed: use the `f` interpolator — with `s"..."` the
    // trailing `%.2f` was printed literally instead of formatting the number.
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    val accuracy = evaluator.evaluate(predictions)
    println(f"\n模型准确率: ${accuracy * 100}%.2f%%")

    // 9. Feature importance analysis.
    // NOTE(review): the one-hot "city_vector" expands into multiple slots of
    // the assembled vector, so these 7 display names only approximately align
    // with featureImportances — derive names from the column metadata for an
    // exact mapping.
    val rfModel = model.stages.last.asInstanceOf[RandomForestClassificationModel]
    println("\n=== 特征重要性 ===")
    val featureNames = Array("年龄分类", "收入", "教育", "城市", "最近登录", "购买次数", "平均订单")
    rfModel.featureImportances.toArray.zip(featureNames)
      .sortBy(-_._1)
      .foreach { case (importance, name) =>
        println(f"$name: ${importance * 100}%.2f%%")
      }

    spark.stop()
  }
}
2. 常用机器学习算法实战
2.1 协同过滤推荐系统
scala
object MovieRecommendation {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Movie Recommendation")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._

    // 1. Load rating data (userId, movieId, rating, timestamp)
    val ratingsDF = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("data/ratings.csv")
      .select("userId", "movieId", "rating")

    println("评分数据统计:")
    ratingsDF.describe("rating").show()

    // 2. Collaborative filtering with ALS
    import org.apache.spark.ml.recommendation.ALS
    val als = new ALS()
      .setMaxIter(10)
      .setRank(10)      // number of latent factors
      .setRegParam(0.1) // regularization strength
      .setUserCol("userId")
      .setItemCol("movieId")
      .setRatingCol("rating")
      .setColdStartStrategy("drop") // drop NaN predictions for unseen users/items

    // 3. Train/test split
    val Array(training, test) = ratingsDF.randomSplit(Array(0.8, 0.2))

    // 4. Train the model
    val model = als.fit(training)

    // 5. Evaluate with RMSE.
    // Fixed: the evaluator lives in org.apache.spark.ml.evaluation — the
    // original imported the nonexistent package org.apache.ml.evaluation.
    val predictions = model.transform(test)
    import org.apache.spark.ml.evaluation.RegressionEvaluator
    val evaluator = new RegressionEvaluator()
      .setMetricName("rmse")
      .setLabelCol("rating")
      .setPredictionCol("prediction")
    val rmse = evaluator.evaluate(predictions)
    println(s"模型RMSE: $rmse")

    // 6. Top-5 movie recommendations for every user
    val userRecs = model.recommendForAllUsers(5)
    println("\n=== 用户推荐结果 ===")
    userRecs.show(5, false)

    // 7. Top-5 user recommendations for every movie
    val movieRecs = model.recommendForAllItems(5)
    println("\n=== 电影推荐用户 ===")
    movieRecs.show(5, false)

    // 8. Recommendations for one specific user.
    // Fixed: the original called recommendForAllItems(10) and filtered on
    // "userId", but that result is keyed by movieId and has no userId column.
    // recommendForUserSubset scores only the target user.
    val targetUser = 123
    val userRecommendations =
      model.recommendForUserSubset(Seq(targetUser).toDF("userId"), 10)
    println(s"\n用户 $targetUser 的推荐电影:")
    userRecommendations.show(false)

    spark.stop()
  }
}
2.2 K-Means聚类分析
scala
object CustomerSegmentation {

  // NOTE: declared outside `main` so Spark can derive an Encoder — TypeTags
  // are not available for case classes defined inside a method.
  final case class CustomerRFM(
    customerId: String,
    recency: Int,          // days since the most recent purchase
    frequency: Int,        // purchase frequency
    monetary: Double,      // total spend
    avgOrderValue: Double,
    categoryCount: Int     // number of distinct categories purchased
  )

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Customer Segmentation")
      .master("local[*]")
      .getOrCreate()
    import spark.implicits._
    import org.apache.spark.ml.clustering.KMeans
    import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}
    // Fixed: avg/count used below were never imported in the original.
    import org.apache.spark.sql.functions.{avg, count}

    // 1. Customer data (RFM features)
    val customerData = Seq(
      CustomerRFM("c001", 5, 20, 5000.0, 250.0, 8),
      CustomerRFM("c002", 30, 5, 800.0, 160.0, 3),
      CustomerRFM("c003", 2, 50, 15000.0, 300.0, 15),
      CustomerRFM("c004", 45, 3, 450.0, 150.0, 2),
      CustomerRFM("c005", 8, 15, 3500.0, 233.0, 6),
      CustomerRFM("c006", 60, 2, 200.0, 100.0, 1),
      CustomerRFM("c007", 3, 35, 12000.0, 342.0, 12),
      CustomerRFM("c008", 20, 8, 1800.0, 225.0, 4)
    ).toDF()

    // 2. Feature engineering: assemble the numeric features into a vector
    val assembler = new VectorAssembler()
      .setInputCols(Array("recency", "frequency", "monetary", "avgOrderValue", "categoryCount"))
      .setOutputCol("features")
    val featureDF = assembler.transform(customerData)

    // 3. Standardize features (important for distance-based clustering!)
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled_features")
      .setWithStd(true)
      .setWithMean(true)
    val scalerModel = scaler.fit(featureDF)
    val scaledDF = scalerModel.transform(featureDF)

    // 4. Elbow method to pick K.
    // Fixed: computeCost is deprecated since Spark 2.4 (removed in 3.0);
    // model.summary.trainingCost reports the same SSE on the training data.
    println("=== 肘部法则分析 ===")
    val ks = Array(2, 3, 4, 5, 6)
    val costs = ks.map { k =>
      val kmeans = new KMeans()
        .setK(k)
        .setSeed(42)
        .setFeaturesCol("scaled_features")
        .setPredictionCol("prediction")
      val model = kmeans.fit(scaledDF)
      val cost = model.summary.trainingCost
      println(s"K=$k, 误差平方和: $cost")
      (k, cost)
    }

    // 5. Cluster with K=3
    val kmeans = new KMeans()
      .setK(3)
      .setSeed(42)
      .setFeaturesCol("scaled_features")
      .setPredictionCol("segment")
      .setMaxIter(20)
    val model = kmeans.fit(scaledDF)

    // 6. Assign each customer to a segment
    val segmented = model.transform(scaledDF)
      .select("customerId", "recency", "frequency", "monetary", "segment")
    println("\n=== 客户分群结果 ===")
    segmented.show()

    // 7. Profile each segment
    println("\n=== 各群体特征分析 ===")
    segmented.groupBy("segment")
      .agg(
        avg("recency").alias("avg_recency"),
        avg("frequency").alias("avg_frequency"),
        avg("monetary").alias("avg_monetary"),
        count("customerId").alias("customer_count")
      )
      .orderBy("segment")
      .show()

    // 8. Inspect the cluster centers (in scaled feature space)
    println("\n=== 聚类中心 ===")
    model.clusterCenters.foreach { center =>
      println(s"中心点: ${center.toArray.mkString(", ")}")
    }

    spark.stop()
  }
}
3. GraphX图计算入门
图数据结构与操作
scala
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
object SocialNetworkAnalysis {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Social Network Analysis")
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext

    // 1. Vertices, shaped as (vertexId, (name, age, city))
    val vertices: RDD[(VertexId, (String, Int, String))] = sc.parallelize(Array(
      (1L, ("Alice", 28, "北京")),
      (2L, ("Bob", 32, "上海")),
      (3L, ("Charlie", 35, "北京")),
      (4L, ("David", 25, "广州")),
      (5L, ("Eve", 30, "深圳")),
      (6L, ("Frank", 40, "北京"))
    ))

    // 2. Edges, shaped as Edge(srcId, dstId, relationship)
    val edges: RDD[Edge[String]] = sc.parallelize(Array(
      Edge(1L, 2L, "朋友"),
      Edge(1L, 3L, "同事"),
      Edge(2L, 3L, "朋友"),
      Edge(3L, 4L, "家人"),
      Edge(4L, 5L, "朋友"),
      Edge(5L, 6L, "同事"),
      Edge(2L, 5L, "朋友"),
      Edge(1L, 6L, "朋友")
    ))

    // 3. Build the graph
    val graph = Graph(vertices, edges)
    println("=== 图基本信息 ===")
    println(s"顶点数: ${graph.vertices.count()}")
    println(s"边数: ${graph.edges.count()}")

    // 4. Basic graph operations
    println("\n=== 所有顶点 ===")
    graph.vertices.collect().foreach { case (id, (name, age, city)) =>
      println(s"ID: $id, 姓名: $name, 年龄: $age, 城市: $city")
    }
    println("\n=== 所有边 ===")
    graph.edges.collect().foreach { edge =>
      println(s"${edge.srcId} -> ${edge.dstId} : ${edge.attr}")
    }

    // 5. Degree statistics.
    // Fixed: the original launched a filter(...).first() Spark job per vertex
    // inside the loop; collect the id -> name lookup table once instead.
    println("\n=== 度数统计 ===")
    val nameById = vertices.map { case (id, (name, _, _)) => (id, name) }.collect().toMap
    graph.degrees.collect().sortBy(-_._2).foreach { case (id, degree) =>
      val name = nameById(id)
      println(s"$name: $degree 个连接")
    }

    // 6. PageRank — find the most influential vertices
    println("\n=== PageRank重要性排名 ===")
    val ranks = graph.pageRank(0.0001).vertices
    val rankByName = vertices.join(ranks).map {
      case (id, ((name, _, _), rank)) => (name, rank)
    }
    rankByName.collect().sortBy(-_._2).foreach { case (name, rank) =>
      println(s"$name: $rank")
    }

    // 7. Connected components — discover social circles
    println("\n=== 连通组件分析 ===")
    val connectedComponents = graph.connectedComponents().vertices
    val componentGroups = connectedComponents.join(vertices).map {
      case (vertexId, (componentId, (name, _, _))) => (componentId, name)
    }.groupByKey()
    componentGroups.collect().foreach { case (componentId, members) =>
      println(s"组件 $componentId: ${members.mkString(", ")}")
    }

    // 8. Triangle counting — measure how tightly knit each vertex is
    println("\n=== 三角形计数 ===")
    val triCounts = graph.triangleCount().vertices
    triCounts.join(vertices).map {
      case (id, (count, (name, _, _))) => (name, count)
    }.collect().sortBy(-_._2).foreach { case (name, count) =>
      println(s"$name 参与 $count 个三角形")
    }

    spark.stop()
  }
}

浙公网安备 33010602011771号