K-means load_iris

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
# Load the Iris dataset: iris.data becomes X, iris.target becomes y.
iris = load_iris()
X = iris.data
y = iris.target
# Standardise the features with StandardScaler before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Seed NumPy's global random generator so repeated runs give the same result.
np.random.seed(20)
def kmeans(X, n_clusters, max_iters=100):
    """Cluster the rows of ``X`` into ``n_clusters`` groups with k-means.

    Centroids are initialised from ``n_clusters`` distinct samples drawn with
    NumPy's global random state (seed it with ``np.random.seed`` for
    repeatable results).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to cluster.
    n_clusters : int
        Number of clusters; must satisfy ``1 <= n_clusters <= n_samples``.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations.

    Returns
    -------
    centroids : ndarray of shape (n_clusters, n_features)
        Final cluster centres.
    labels : ndarray of shape (n_samples,)
        Index of the nearest centroid for every sample. If ``max_iters`` is
        not positive, no assignment step runs and all labels are 0.

    Raises
    ------
    ValueError
        If ``n_clusters`` is outside ``1..n_samples``.
    """
    X = np.asarray(X, dtype=float)
    n_samples = len(X)
    if not 1 <= n_clusters <= n_samples:
        raise ValueError("n_clusters must be between 1 and the number of samples")

    # Initialise the cluster centres from randomly chosen, distinct samples.
    init_idx = np.random.choice(n_samples, n_clusters, replace=False)
    centroids = X[init_idx]
    labels = np.zeros(n_samples, dtype=int)

    for _ in range(max_iters):
        # Distance from every sample to every centroid: (n_samples, n_clusters).
        distances = np.linalg.norm(X[:, None] - centroids, axis=2)
        # Assign each sample to its nearest centroid.
        labels = np.argmin(distances, axis=1)
        # Recompute each centroid as the mean of its members. A cluster that
        # lost all its members keeps its previous centre instead of becoming
        # NaN (the mean of an empty selection).
        new_centroids = centroids.copy()
        for j in range(n_clusters):
            members = X[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)
        # Converged once the centres stop moving.
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels
#### Fill in the key code from the textbook, then re-run ####
# Run the hand-written k-means with 3 clusters.
centroids,labels=kmeans(X,n_clusters=3,max_iters=100)
# Plot the centroids as red stars and the samples coloured by cluster label,
# using only the first two feature columns.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, color='red')
plt.scatter(X[:, 0], X[:, 1], c=labels,s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmeans')
plt.show()

# Reference plot: the same two columns coloured by the true target y.
plt.scatter(X[:, 0], X[:, 1], c=y,s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Iris Dataset')
plt.show()

K-medoids load_iris

python实现

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
def distance(x1, x2):
    """Return the Manhattan (L1) distance between ``x1`` and ``x2``."""
    # Swap in np.linalg.norm(x1 - x2) here for Euclidean distance instead.
    diff = x1 - x2
    return np.abs(diff).sum()
    
def kmedoids(X, n_clusters, max_iters=100):
    """Cluster the rows of ``X`` around ``n_clusters`` medoids.

    Initial medoids are ``n_clusters`` distinct samples drawn with NumPy's
    global random state. Dissimilarity is measured with the module-level
    ``distance`` function.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Samples to cluster.
    n_clusters : int
        Number of medoids/clusters.
    max_iters : int, default 100
        Upper bound on the number of assign/update iterations.

    Returns
    -------
    centroids : ndarray of shape (n_clusters, n_features)
        The medoid samples (rows of ``X``).
    labels : ndarray of int, shape (n_samples,)
        Index of the nearest returned medoid for every sample.
    """
    X = np.asarray(X)

    def nearest_medoid(current):
        # For each sample, the position (0..n_clusters-1) of its closest medoid.
        return np.array(
            [np.argmin([distance(x, X[m]) for m in current]) for x in X],
            dtype=int,
        )

    medoids = np.random.choice(len(X), n_clusters, replace=False)
    for _ in range(max_iters):
        labels = nearest_medoid(medoids)
        new_medoids = medoids.copy()
        for c in range(n_clusters):
            members = np.flatnonzero(labels == c)
            if members.size == 0:
                # Duplicate points chosen as medoids leave the later cluster
                # empty (ties go to the first); keep its old medoid rather
                # than calling argmin on an empty list.
                continue
            # The new medoid is the member with the smallest total distance
            # to all other members of its cluster.
            costs = [np.sum([distance(X[i], X[j]) for j in members]) for i in members]
            new_medoids[c] = members[np.argmin(costs)]
        if np.array_equal(medoids, new_medoids):
            break
        medoids = new_medoids
    else:
        # Iteration budget exhausted (or zero): the last assignment belongs to
        # the previous medoids, so recompute it for the ones being returned.
        labels = nearest_medoid(medoids)
    return X[medoids], labels

# Load the Iris dataset: iris.data becomes X, iris.target becomes y.
iris = load_iris()
X = iris.data
y = iris.target
# Standardise the features with StandardScaler before clustering.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Seed NumPy's global random generator; kmedoids draws its initial medoids from it.
np.random.seed(20)
#### Fill in the key code from the textbook, then re-run ####
# Run the hand-written k-medoids with 3 clusters.
centroids,labels=kmedoids(X,n_clusters=3,max_iters=100)
# Plot the medoids as red stars and the samples coloured by cluster label,
# using only the first two feature columns.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, color='red')
plt.scatter(X[:, 0], X[:, 1], c=labels,s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmedoids')
plt.show()

# Reference plot: the same two columns coloured by the true target y.
plt.scatter(X[:, 0], X[:, 1], c=y,s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Iris Dataset')
plt.show()

sklearn实现

from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load the Iris features and standardise them with StandardScaler.
iris = load_iris()
X = iris.data
scaler = StandardScaler()
X = scaler.fit_transform(X)
#### Fill in the key code from the textbook, then re-run ####
# Fit scikit-learn's KMeans with 3 clusters, a fixed random_state and n_init=10.
kmeans=KMeans(n_clusters=3,random_state=20,n_init=10)
kmeans.fit(X)
# Read the fitted cluster assignments and cluster centres.
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
# Plot the centres as red stars and the samples coloured by cluster label,
# using only the first two feature columns.
plt.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=200, color='red')
plt.scatter(X[:, 0], X[:, 1], c=labels,s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmeans (Sklearn)')
plt.show()

【实例】Seeds数据集简要分析

import pandas as pd
import matplotlib.pyplot as plt

# Alternative location of the data file (disabled):
#data = pd.read_csv("data\\ch07聚类\\seeds_dataset.txt", sep="\t", header=None)
# Read the tab-separated seeds file from the working directory; it has no header row.
data = pd.read_csv("seeds_dataset.txt", sep="\t", header=None)
#### Fill in the key code from the textbook, then re-run ####
# Show the first rows of the table.
print(data.head())

#### Fill in the key code from the textbook, then re-run ####
# Show summary statistics for every column.
print(data.describe())

# Draw one histogram per column.
data.hist(figsize=(7, 4))
plt.tight_layout()
plt.show()

!pip install seaborn

import seaborn as sns
# Heat map of the pairwise column correlations, with the values annotated.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu")
plt.show()

# Pairwise scatter plots for the first four columns only.
sns.pairplot(data.iloc[:, 0:4], height=1.2)
plt.show()

小麦种子层次聚类

1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Load the dataset.
# Alternative location of the data file (disabled):
#data = pd.read_csv("data\\ch07聚类\\seeds_dataset.txt",sep="\t", header=None)
data = pd.read_csv("seeds_dataset.txt", sep="\t", header=None)
# Feature vectors: the first 20 rows, every column except the last.
X = data.iloc[:20, :-1].values
print(X.shape)  # (20, 7)
# Hierarchical clustering with Ward linkage.
#### Fill in the key code from the textbook, then re-run ####
Z = linkage(X, 'ward')
# Draw the dendrogram.
plt.figure(figsize=(12, 6))
# Optional Chinese axis labels/title (disabled; they need a CJK-capable font
# such as SimSun):
#plt.rcParams['font.family'] = 'SimSun'
#### Fill in the key code from the textbook, then re-run ####
# Fixed: the call was misspelled "dendrograme", which raised NameError.
dendrogram(Z)
#plt.xlabel('样本索引')
#plt.ylabel('距离')
#plt.title('Seeds数据集层次聚类树状图')
plt.show()
2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Load the dataset.
# Alternative location of the data file (disabled):
#data = pd.read_csv("data\\ch07聚类\\seeds_dataset.txt",sep="\t", header=None)
data = pd.read_csv("seeds_dataset.txt",sep="\t", header=None)
X = data.iloc[:, :-1].values  # feature vectors: all rows, every column except the last
print(X.shape) #(210, 7)
# Hierarchical clustering with Ward linkage.
Z = linkage(X, 'ward')
# Draw the dendrogram.
plt.figure(figsize=(12, 6))
# Optional Chinese axis labels/title (disabled; they need a CJK-capable font
# such as SimSun):
#plt.rcParams['font.family'] = 'SimSun'
#### Fill in the key code from the textbook, then re-run ####
# Truncate the tree to the last 30 merged clusters so the plot stays readable.
dendrogram(Z,truncate_mode='lastp',p=30,leaf_font_size=12,show_contracted=True)
#plt.xlabel('样本索引')
#plt.ylabel('距离')
#plt.title('Seeds数据集层次聚类树状图')
plt.show()

ai写的

import numpy as np

def linkage(X, method='single'):
    """Agglomerative hierarchical clustering by repeated closest-pair merging.

    Starting from one cluster per sample, the two clusters with the smallest
    ``compute_distance`` value are merged until a single cluster remains.

    NOTE: this definition rebinds the name ``linkage`` that earlier cells
    import from ``scipy.cluster.hierarchy``; its return value is different.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Samples to cluster.
    method : str, default 'single'
        Linkage criterion passed through to ``compute_distance``.

    Returns
    -------
    distances : ndarray of shape (n_samples - 1, 4)
        One row per merge: ``[label_a, label_b, merge_distance, new_size]``.
        Original samples are labelled ``0..n-1``; the cluster created by merge
        ``k`` is labelled ``n + k``.
    cluster_indices : ndarray of shape (n_samples,)
        Label of the cluster each sample belongs to after the last merge.
    """
    n = X.shape[0]
    # max(..., 0) keeps an empty input from requesting a negative-sized array.
    distances = np.zeros((max(n - 1, 0), 4))

    clusters = [[i] for i in range(n)]
    cluster_indices = np.arange(n)

    for k in range(n - 1):
        m = len(clusters)
        # Start from +inf so that only genuine pairs (i < j) can win argmin.
        # A zero diagonal would be picked whenever all real distances are
        # positive, merging a cluster with itself.
        pairwise_distances = np.full((m, m), np.inf)
        for i in range(m):
            for j in range(i + 1, m):
                pairwise_distances[i, j] = compute_distance(
                    X, clusters[i], clusters[j], method
                )

        i, j = np.unravel_index(pairwise_distances.argmin(), pairwise_distances.shape)
        new_cluster = clusters[i] + clusters[j]
        d = pairwise_distances[i, j]
        # Every member of a cluster carries the same label, so the first
        # member is enough to look it up.
        label_i = cluster_indices[clusters[i][0]]
        label_j = cluster_indices[clusters[j][0]]
        cluster_indices[new_cluster] = n + k
        distances[k] = [label_i, label_j, d, len(new_cluster)]

        # Only the upper triangle is filled, so i < j: delete the higher
        # index first so the lower one stays valid.
        del clusters[j]
        del clusters[i]
        clusters.append(new_cluster)
    return distances, cluster_indices

def compute_distance(X, cluster1, cluster2, method):
    """Return the linkage distance between two clusters of samples.

    All criteria are based on the Euclidean distance between every sample of
    ``cluster1`` and every sample of ``cluster2``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample matrix.
    cluster1, cluster2 : sequence of int
        Row indices into ``X`` for the members of each cluster.
    method : {'single', 'complete', 'average'}
        'single' is the smallest pairwise distance, 'complete' the largest,
        'average' the mean over all pairs.

    Returns
    -------
    float
        The distance between the two clusters.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method not in ('single', 'complete', 'average'):
        raise ValueError("Invalid method specified.")

    X = np.asarray(X)
    # Pairwise Euclidean distances, shape (len(cluster1), len(cluster2)).
    # 'single'/'complete' previously took the min/max of the signed
    # coordinate differences, which is not a distance at all.
    diffs = X[cluster1][:, None, :] - X[cluster2][None, :, :]
    pairwise = np.linalg.norm(diffs, axis=2)

    if method == 'single':
        dist = pairwise.min()
    elif method == 'complete':
        dist = pairwise.max()
    else:
        dist = pairwise.mean()

    return dist

# Example data: four 2-D points.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Run hierarchical clustering with single linkage.
#### Fill in the key code from the textbook, then re-run ####
distances, cluster_indices = linkage(X, method='single')
# Print the merge table returned by linkage.
print("Distances:")
print(distances)

# Print the per-sample cluster labels returned by linkage.
print("Cluster indices:")
print(cluster_indices)

谱聚类

python实现

import numpy as np
from sklearn.datasets import load_iris
from scipy.linalg import eigh
from sklearn.cluster import KMeans
def kernel_matrix(X, sigma=1):
    """Compute the RBF (Gaussian) kernel matrix of the data matrix ``X``.

    Off-diagonal entries are ``exp(-||x_i - x_j||^2 / (2 * sigma^2))``. The
    diagonal is left at zero, as in the original pairwise loop; a
    self-similarity term cancels out of an unnormalised Laplacian ``D - K``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix.
    sigma : float, default 1
        Bandwidth of the RBF kernel.

    Returns
    -------
    K : ndarray of shape (n_samples, n_samples)
        Symmetric kernel matrix with a zero diagonal.
    """
    X = np.asarray(X, dtype=float)
    # Squared Euclidean distance between every pair of rows via broadcasting.
    # This replaces the double Python loop, which wrote to an undefined
    # lower-case ``k`` and raised NameError.
    diffs = X[:, None, :] - X[None, :, :]
    sq_dists = np.sum(diffs ** 2, axis=2)
    K = np.exp(-sq_dists / (2 * sigma ** 2))
    np.fill_diagonal(K, 0.0)
    return K
def spectral_clustering(K, n_clusters):
    """Spectral clustering on a precomputed kernel (affinity) matrix.

    Builds the unnormalised graph Laplacian ``L = D - K``, takes the
    eigenvectors belonging to its ``n_clusters`` smallest eigenvalues and
    clusters their rows with KMeans.

    Parameters
    ----------
    K : ndarray of shape (n_samples, n_samples)
        Kernel matrix.
    n_clusters : int
        Number of clusters.

    Returns
    -------
    clusters : ndarray of shape (n_samples,)
        Cluster label of every sample.
    """
    # Degree matrix: row sums of K on the diagonal.
    D = np.diag(np.sum(K, axis=1))
    # Unnormalised graph Laplacian.
    L = D - K
    # Eigenvalues and eigenvectors of the symmetric Laplacian.
    eigen_vals, eigen_vecs = eigh(L)
    # Keep the eigenvectors of the n_clusters smallest eigenvalues.
    idx = np.argsort(eigen_vals)[:n_clusters]
    eigenvectors = eigen_vecs[:, idx]
    # Cluster the rows of the spectral embedding. n_init is pinned to match
    # the other KMeans call in this file (fixed: the fitted object was
    # referenced through the misspelled name "Kmaens").
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(eigenvectors)
    return clusters
def main():
    """Run the from-scratch spectral clustering on standardised Iris data.

    Prints the true class and the assigned cluster for the first 10 samples.
    """
    # Imported here because this cell's import list does not include it.
    from sklearn.preprocessing import StandardScaler

    iris = load_iris()
    X = iris.data
    y = iris.target
    # Standardise the features.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Kernel matrix of the standardised data.
    K = kernel_matrix(X_scaled, sigma=1)
    # Spectral clustering (fixed: the result was bound to ``cluster`` but
    # read back as ``clusters``).
    clusters = spectral_clustering(K, n_clusters=3)
    # Print the clustering result for the first 10 samples.
    for i in range(10):
        print("样本 {}: 真实类别{},聚类结果{}".format(i, y[i], clusters[i]))

#     # 可视化聚类结果(仅适用于二维数据)
#     import matplotlib.pyplot as plt
#     plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)
#     plt.xlabel('Feature 1')
#     plt.ylabel('Feature 2')
#     plt.title('Spectral Clustering - Iris Dataset')
#     plt.show()
if __name__ == "__main__":
    main()

sklearn实现

1
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from scipy.sparse import csgraph
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the Iris dataset.
iris = load_iris()
X = iris.data
y = iris.target

# Standardise the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Similarity matrix from the RBF kernel (library-default gamma).
#### Fill in the key code from the textbook, then re-run ####
similarity = rbf_kernel(X_scaled)
# Degree matrix: row sums of the similarity matrix on the diagonal.
degree = np.diag(np.sum(similarity, axis=1))

# Unnormalised graph Laplacian.
#### Fill in the key code from the textbook, then re-run ####
laplacian = degree - similarity
# Eigenvalues and eigenvectors of the Laplacian.
#### Fill in the key code from the textbook, then re-run ####
eigenvalues, eigenvectors = np.linalg.eigh(laplacian)
# Sort the eigenvalues and keep the eigenvectors of the k smallest.
k = 3
indices = np.argsort(eigenvalues)[:k]
#### Fill in the key code from the textbook, then re-run ####
eigenvectors_subset = eigenvectors[:, indices]
# Cluster the rows of the selected eigenvectors with KMeans.
#### Fill in the key code from the textbook, then re-run ####
kmeans = KMeans(n_clusters=k, random_state=0)
clusters = kmeans.fit_predict(eigenvectors_subset)
# Print the true class and assigned cluster of the first 10 samples.
for i in range(10):
    print("样本 {}: 真实类别{},聚类结果{}".format(i, y[i], clusters[i]))

# plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('Spectral Clustering - Iris Dataset')
# plt.show()
2
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt

# Load the Iris dataset.
iris = load_iris()
X = iris.data
y=iris.target

# Standardise the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set up the spectral clustering estimator.
n_clusters = 3  # number of clusters
#### Fill in the key code from the textbook, then re-run ####
spectral_clustering=SpectralClustering(n_clusters=n_clusters,random_state=42)
# Fit and get the cluster label of every sample.
#### Fill in the key code from the textbook, then re-run ####
clusters=spectral_clustering.fit_predict(X_scaled)
# Disabled: print the true class and assigned cluster of every sample.
# for i in range(len(X)):
#     print("样本 {} 类别{},聚类结果:{}".format(i, y[i],clusters[i]))

# Scatter plot of the first two standardised features, coloured by cluster.
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Spectral Clustering - Iris Dataset')
plt.show()

代表性的聚类算法性能比较

import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
# Generate five synthetic 2-D datasets.
data1 = datasets.make_blobs(n_samples=500, random_state=30)
data2 = datasets.make_blobs(n_samples=500, cluster_std=[1.0, 2.5, 0.5], random_state=30)
# Blobs stretched by a linear transformation (anisotropic clusters).
X3, y3 = datasets.make_blobs(n_samples=500, random_state=30)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X3 = np.dot(X3, transformation)
data3 = (X3, y3)
data4 = datasets.make_circles(n_samples=500, factor=0.5, noise=0.05, random_state=30)
data5 = datasets.make_moons(n_samples=500, noise=0.05, random_state=30)
# Named so that it does not overwrite the imported ``datasets`` module, which
# would break this cell on a second run.
all_datasets = [data1, data2, data3, data4, data5]

# Colour palette, built once. The last entry (black) is what label -1 maps to.
colors = np.array(["#ff7f00", "#984ea3", "#377eb8", "#4daf4a", "#999999", "#000000"])

plt.figure(figsize=(20, 12))
plt.subplots_adjust(wspace=0.02, hspace=0.02)
plot_num = 1
for i_dataset, dataset in enumerate(all_datasets):
    X, y = dataset
    nC = len(np.unique(y))  # number of classes
    X = StandardScaler().fit_transform(X)
    # Create six different clustering models.
    #### Fill in the key code from the textbook, then re-run ####
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.2)
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # Not named ``KMeans``: that would shadow the class imported by earlier cells.
    kmeans_model = cluster.KMeans(n_clusters=nC, random_state=20, n_init="auto")
    # Fixed: the keyword is ``n_clusters`` (``n_cluster`` raised TypeError).
    spectral = cluster.SpectralClustering(
        n_clusters=nC,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        random_state=20,
    )
    dbscan = cluster.DBSCAN(eps=0.3)
    birch = cluster.Birch(n_clusters=nC)
    gmm = mixture.GaussianMixture(
        n_components=nC,
        covariance_type="full",
        random_state=20,
    )
    clustering_algorithms = (
        ("KMeans", kmeans_model),
        ("BIRCH", birch),
        ("DBSCAN", dbscan),
        ("MeanShift", ms),
        ("Gaussian Mixture", gmm),
        ("Spectral Clustering", spectral),
    )
    for name, algorithm in clustering_algorithms:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            #### Fill in the key code from the textbook, then re-run ####
            # The fit must run inside the ``with`` block; the filter is
            # removed again as soon as the block exits.
            algorithm.fit(X)
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            #### Fill in the key code from the textbook, then re-run ####
            # Estimators without ``labels_`` are asked to predict instead.
            y_pred = algorithm.predict(X)
        plt.subplot(len(all_datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        # Wrap labels onto the palette: -1 still selects the last colour and a
        # model that finds more than six clusters no longer raises IndexError.
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred % len(colors)])
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1
plt.show()