# Machine Learning Task 7 (机器学习任务7)
# 导入必要的库
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, classification_report, accuracy_score
# Cluster the iris data set with K-means and score the clusters against the
# true species labels on a held-out test split.

# Load the iris data set.
iris = datasets.load_iris()
X = iris.data  # feature matrix
y_true = iris.target  # ground-truth class labels

# Hold out roughly one third of the samples as a stratified test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_true, test_size=0.33, random_state=42, stratify=y_true
)
print(f"训练集样本数: {len(y_train)}, 测试集样本数: {len(y_test)}")

# K-means with 3 clusters. n_init is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans.fit(X_train)

# Silhouette coefficient on the training split.
train_labels = kmeans.predict(X_train)
train_silhouette = silhouette_score(X_train, train_labels)
print(f"训练集轮廓系数: {train_silhouette:.4f}")

# K-means cluster ids are arbitrary: cluster 0 need not correspond to class 0.
# Comparing raw ids with the true labels would make the metrics below depend
# on how the clusters happened to be numbered. Learn a cluster -> class
# mapping by majority vote on the TRAINING split only, so no test labels are
# used to build it.
n_classes = len(np.unique(y_true))
cluster_to_class = np.arange(kmeans.n_clusters)  # identity fallback
for cluster_id in range(kmeans.n_clusters):
    member_classes = y_train[train_labels == cluster_id]
    if member_classes.size:
        cluster_to_class[cluster_id] = np.bincount(
            member_classes, minlength=n_classes
        ).argmax()

# Assign test samples to clusters, then translate cluster ids to class labels.
test_clusters = kmeans.predict(X_test)
y_pred = cluster_to_class[test_clusters]

# Classification report as a dict, computed on the mapped predictions.
report = classification_report(y_test, y_pred, output_dict=True)

# Print the test-set results.
print(f"\n测试集分类报告:\n {classification_report(y_test, y_pred)}")
print(f"准确度: {accuracy_score(y_test, y_pred)}")
print(f"聚类结果:\n{y_pred}")

# 浙公网安备 33010602011771号