March 18
Worked on the AI acceptance project.
Completed the code for the clustering and classification algorithms (PySpark MLlib, K-Means plus random forest on the iris dataset):
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Initialize the Spark session
spark = SparkSession.builder.appName("MLlibExample").getOrCreate()

# Load the dataset (iris as an example).
# Assumes feature columns sepal_length, sepal_width, petal_length,
# petal_width and a numeric label column named "label".
data = spark.read.csv("iris.csv", header=True, inferSchema=True)

# Assemble the feature columns into a single vector column
assembler = VectorAssembler(
    inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
    outputCol="features")

# Clustering: K-Means. Its output column is renamed to "cluster" so it
# does not collide with the classifier's default "prediction" column
# when both stages run in the same pipeline.
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=3, seed=42)

# Classification: random forest
rf = RandomForestClassifier(featuresCol="features", labelCol="label", seed=42)

# Build the pipeline
pipeline = Pipeline(stages=[assembler, kmeans, rf])

# Split the data into training and test sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

# Train the model
model = pipeline.fit(train_data)

# Predict on the test set
predictions = model.transform(test_data)

# Evaluate the classification model
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy:.4f}")

evaluator.setMetricName("f1")
f1_score = evaluator.evaluate(predictions)
print(f"F1 Score: {f1_score:.4f}")

evaluator.setMetricName("weightedRecall")
recall = evaluator.evaluate(predictions)
print(f"Weighted Recall: {recall:.4f}")

# Evaluating the clustering model (optional): since K-Means is
# unsupervised, metrics such as the silhouette coefficient are normally
# used; only the classification metrics are reported here.

# Stop the Spark session
spark.stop()
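The clustering stage can be scored as well. A minimal sketch using pyspark.ml.evaluation.ClusteringEvaluator (silhouette metric, available since Spark 2.3); it assumes the `predictions` DataFrame from the pipeline above, with K-Means output in the "cluster" column as configured there, and would run before spark.stop():

from pyspark.ml.evaluation import ClusteringEvaluator

# Silhouette score for the K-Means stage: values near 1 mean points sit
# well inside their own cluster, values near -1 mean likely misassignment.
clustering_evaluator = ClusteringEvaluator(
    featuresCol="features", predictionCol="cluster",
    metricName="silhouette", distanceMeasure="squaredEuclidean")
silhouette = clustering_evaluator.evaluate(predictions)
print(f"Silhouette: {silhouette:.4f}")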
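One assumption worth flagging: RandomForestClassifier needs a numeric label. If iris.csv actually stores the class as a string (e.g. a "species" column; the name is hypothetical), a StringIndexer stage would convert it first. A sketch under that assumption:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# Hypothetical: map a string "species" column to a numeric "label"
# column before the classifier stage sees it.
indexer = StringIndexer(inputCol="species", outputCol="label")
pipeline = Pipeline(stages=[assembler, indexer, kmeans, rf])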