// 2025/1/24
// End-to-end example: load a CSV, assemble numeric columns into a feature
// vector, train a logistic-regression classifier and show a few predictions.

// Data import.
// Without `inferSchema`, the CSV reader loads every column as a string, and
// VectorAssembler rejects string input columns. Inferring the schema makes the
// numeric columns arrive as numeric types.
val data = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("data/adult.csv")

// Data preprocessing: combine the numeric columns into one "features" vector column.
val assembler = new VectorAssembler()
  .setInputCols(Array("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"))
  .setOutputCol("features")
val dataWithFeatures = assembler.transform(data)

// Model training.
// NOTE(review): this assumes the CSV has a numeric column literally named
// "label" — confirm against the file's header; a string-valued target would
// need to be indexed (e.g. with StringIndexer) first.
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("features")
// Fixed seed so the 70/30 train/test split is reproducible between runs.
val Array(trainingData, testData) = dataWithFeatures.randomSplit(Array(0.7, 0.3), 42L)
val model = lr.fit(trainingData)

// Prediction on the held-out split; print the first five rows.
val predictions = model.transform(testData)
predictions.select("features", "label", "prediction").show(5)