K-means load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
# Load the Iris feature matrix together with the ground-truth species labels.
iris = load_iris()
X, y = iris.data, iris.target

# Standardise the features before any distance computation.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fix the state of NumPy's global RNG so repeated runs give the same result.
np.random.seed(20)
def kmeans(X, n_clusters, max_iters=100):
    """Cluster the rows of ``X`` with the standard (Lloyd) k-means iteration.

    Args:
        X: Array-like of shape (n_samples, n_features).
        n_clusters: Number of clusters; must not exceed the number of samples.
        max_iters: Upper bound on the number of assign/update rounds.

    Returns:
        A ``(centroids, labels)`` tuple. ``centroids`` has shape
        (n_clusters, n_features); ``labels[i]`` is the index of the centroid
        that ``X[i]`` was assigned to in the last round.

    Raises:
        ValueError: If ``n_clusters`` is larger than the number of samples
            (raised by ``np.random.choice``).
    """
    X = np.asarray(X, dtype=float)

    # Initialise the centroids with distinct, randomly chosen samples.  The
    # global NumPy RNG is used on purpose so that a preceding
    # ``np.random.seed`` call makes the run reproducible.
    centroids = X[np.random.choice(len(X), n_clusters, replace=False)]
    # Defined up front so the function still returns when max_iters == 0.
    labels = np.zeros(len(X), dtype=int)

    for _ in range(max_iters):
        # Euclidean distance from every sample to every centroid:
        # result has shape (n_samples, n_clusters).
        distances = np.linalg.norm(X[:, None] - centroids, axis=2)
        # Assign every sample to its nearest centroid.
        labels = np.argmin(distances, axis=1)

        # Move every centroid to the mean of its members.  A cluster that
        # lost all members keeps its previous centroid instead of becoming
        # NaN (the mean of an empty selection).
        new_centroids = centroids.copy()
        for j in range(n_clusters):
            members = X[labels == j]
            if len(members):
                new_centroids[j] = members.mean(axis=0)

        # Converged once the centroids stop moving.
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels
# Run the hand-written k-means on the standardised Iris data.
centroids, labels = kmeans(X, n_clusters=3, max_iters=100)

# Figure 1: centroids as red stars, samples coloured by assigned cluster.
# Only the first two feature columns are drawn.
plt.scatter(*centroids[:, :2].T, marker='*', s=200, color='red')
plt.scatter(*X[:, :2].T, c=labels, s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmeans')
plt.show()

# Figure 2: the same projection coloured by the true species, for comparison.
plt.scatter(*X[:, :2].T, c=y, s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Iris Dataset')
plt.show()
kmedoids load_iris
python实现
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
def distance(x1, x2):
    """Return the Manhattan (L1) distance between ``x1`` and ``x2``.

    To switch to the Euclidean distance, return
    ``np.linalg.norm(x1 - x2)`` instead.
    """
    gap = x1 - x2
    return np.abs(gap).sum()
def kmedoids(X, n_clusters, max_iters=100):
    """Cluster the rows of ``X`` around medoids (actual samples).

    Each round assigns every sample to its nearest medoid (as measured by
    the module-level ``distance`` function) and then, inside each cluster,
    picks the member with the smallest total distance to the other members
    as the new medoid.  Iteration stops when the medoids no longer change
    or after ``max_iters`` rounds.

    Args:
        X: NumPy array of shape (n_samples, n_features).
        n_clusters: Number of clusters; must not exceed the number of samples.
        max_iters: Upper bound on the number of rounds.

    Returns:
        A ``(centroids, labels)`` tuple. ``centroids`` are the rows of ``X``
        chosen as medoids, shape (n_clusters, n_features). ``labels`` is an
        integer array with the cluster index of every sample.

    Raises:
        ValueError: If ``n_clusters`` is larger than the number of samples
            (raised by ``np.random.choice``).
    """
    # Distinct random samples as initial medoids (global NumPy RNG, so a
    # preceding ``np.random.seed`` call makes the run reproducible).
    medoids = list(np.random.choice(len(X), n_clusters, replace=False))
    # Defined up front so the function still returns when max_iters == 0.
    clusters = [[] for _ in range(n_clusters)]

    for _ in range(max_iters):
        # Assignment step: each sample joins the cluster of its nearest medoid.
        clusters = [[] for _ in range(n_clusters)]
        for i, x in enumerate(X):
            distances = [distance(x, X[m]) for m in medoids]
            clusters[int(np.argmin(distances))].append(i)

        # Update step: the member with the smallest summed distance to its
        # cluster mates becomes the new medoid.
        new_medoids = []
        for old_medoid, cluster in zip(medoids, clusters):
            if not cluster:
                # Two medoids can coincide when X contains duplicate rows;
                # the later one then attracts no samples.  Keep it instead
                # of calling argmin on an empty list (ValueError).
                new_medoids.append(old_medoid)
                continue
            cluster_distances = [
                sum(distance(X[i], X[j]) for j in cluster) for i in cluster
            ]
            new_medoids.append(cluster[int(np.argmin(cluster_distances))])

        if np.array_equal(medoids, new_medoids):
            break
        medoids = new_medoids

    centroids = X[medoids]
    # Integer labels, matching what ``np.argmin`` yields in k-means.
    labels = np.zeros(len(X), dtype=int)
    for i, cluster in enumerate(clusters):
        if cluster:
            labels[cluster] = i
    return centroids, labels
# Load and standardise the Iris data.
iris = load_iris()
X, y = iris.data, iris.target
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Seed the global RNG used for the initial medoid draw, then cluster.
np.random.seed(20)
centroids, labels = kmedoids(X, n_clusters=3, max_iters=100)

# Figure 1: medoids as red stars, samples coloured by assigned cluster
# (first two feature columns only).
plt.scatter(*centroids[:, :2].T, marker='*', s=200, color='red')
plt.scatter(*X[:, :2].T, c=labels, s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmedoids')
plt.show()

# Figure 2: the same projection coloured by the true species.
plt.scatter(*X[:, :2].T, c=y, s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Iris Dataset')
plt.show()
sklearn实现
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load Iris and standardise the features.
iris = load_iris()
X = iris.data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Fit scikit-learn's KMeans (``fit`` returns the estimator itself).
kmeans = KMeans(n_clusters=3, random_state=20, n_init=10).fit(X)
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

# Cluster centres as red stars over the samples coloured by cluster,
# drawn on the first two feature columns.
plt.scatter(*centroids[:, :2].T, marker='*', s=200, color='red')
plt.scatter(*X[:, :2].T, c=labels, s=10)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Kmeans (Sklearn)')
plt.show()
【实例】Seeds数据集简要分析
import pandas as pd
import matplotlib.pyplot as plt
# Read the Seeds dataset: tab-separated values without a header row.
# Alternative location: "data\\ch07聚类\\seeds_dataset.txt"
data = pd.read_csv("seeds_dataset.txt", sep="\t", header=None)

# Show the first rows, then the per-column summary statistics.
print(data.head(), data.describe(), sep="\n")

# One histogram per column.
data.hist(figsize=(7, 4))
plt.tight_layout()
plt.show()
!pip install seaborn
import seaborn as sns
# Heat map of the pairwise column correlations, with the values annotated.
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, cmap="YlGnBu", annot=True)
plt.show()

# Scatter-plot matrix restricted to the first four columns.
sns.pairplot(data.iloc[:, :4], height=1.2)
plt.show()
小麦种子层次聚类
1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Load the Seeds dataset: tab-separated values without a header row.
# Alternative location: "data\\ch07聚类\\seeds_dataset.txt"
data = pd.read_csv("seeds_dataset.txt", sep="\t", header=None)

# Feature vectors of the first 20 samples; the last column is dropped.
X = data.iloc[:20, :-1].values
print(X.shape)  # (20, 7)

# Agglomerative clustering with Ward linkage.
Z = linkage(X, 'ward')

# Draw the full dendrogram.
plt.figure(figsize=(12, 6))
# The call below was misspelled ``dendrograme`` and raised NameError.
dendrogram(Z)
# Axis labels and a title were left disabled here; Chinese captions would
# additionally need e.g. plt.rcParams['font.family'] = 'SimSun'.
plt.show()
2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Load the Seeds dataset: tab-separated values without a header row.
# Alternative location: "data\\ch07聚类\\seeds_dataset.txt"
data = pd.read_csv("seeds_dataset.txt", sep="\t", header=None)

# Feature vectors of every sample; the last column is dropped.
X = data.iloc[:, :-1].values
print(X.shape)  # (210, 7)

# Agglomerative clustering with Ward linkage.
Z = linkage(X, method='ward')

# Truncated dendrogram: only the last 30 merged clusters are shown.
plt.figure(figsize=(12, 6))
dendrogram(
    Z,
    p=30,
    truncate_mode='lastp',
    show_contracted=True,
    leaf_font_size=12,
)
# Axis labels and a title were left disabled here; Chinese captions would
# additionally need e.g. plt.rcParams['font.family'] = 'SimSun'.
plt.show()
ai写的
import numpy as np
def linkage(X, method='single'):
    """Agglomerative hierarchical clustering, written out naively.

    Starting from one cluster per sample, the two closest clusters (as
    measured by ``compute_distance`` with the given ``method``) are merged
    repeatedly until a single cluster remains.

    Args:
        X: NumPy array of shape (n_samples, n_features).
        method: Linkage criterion forwarded to ``compute_distance``.

    Returns:
        A ``(distances, cluster_indices)`` tuple. ``distances`` has shape
        (n_samples - 1, 4); row ``k`` is ``[id_a, id_b, dist, size]`` for the
        k-th merge, where original samples have ids ``0..n-1`` and the
        cluster created by merge ``k`` gets id ``n + k``.
        ``cluster_indices[i]`` is the id of the cluster sample ``i`` ended
        up in after the last merge.
    """
    n = X.shape[0]
    # max(...) keeps an empty input from requesting a negative array size.
    distances = np.zeros((max(n - 1, 0), 4))
    clusters = [[i] for i in range(n)]
    cluster_indices = np.arange(n)

    for k in range(n - 1):
        m = len(clusters)
        # Start from +inf so neither the diagonal (a cluster versus itself)
        # nor the unused lower triangle can win the argmin below.  With a
        # zero-filled matrix the diagonal always won and a cluster was
        # "merged" with itself.
        pairwise_distances = np.full((m, m), np.inf)
        for i in range(m):
            for j in range(i + 1, m):
                pairwise_distances[i, j] = compute_distance(
                    X, clusters[i], clusters[j], method
                )

        # Closest pair; i < j because only the upper triangle is finite.
        i, j = np.unravel_index(
            pairwise_distances.argmin(), pairwise_distances.shape
        )
        new_cluster = clusters[i] + clusters[j]
        d = pairwise_distances[i, j]

        # All members of a cluster share one id, so reading the first
        # member's entry gives the id of the whole cluster.
        label_i = cluster_indices[clusters[i][0]]
        label_j = cluster_indices[clusters[j][0]]
        new_label = n + k
        for idx in new_cluster:
            cluster_indices[idx] = new_label
        distances[k] = [label_i, label_j, d, len(new_cluster)]

        # Delete the higher position first so the lower one stays valid.
        del clusters[j]
        del clusters[i]
        clusters.append(new_cluster)
    return distances, cluster_indices
def compute_distance(X, cluster1, cluster2, method):
    """Return the linkage distance between two clusters of samples.

    Args:
        X: NumPy array of shape (n_samples, n_features).
        cluster1: Sequence of row indices of ``X`` forming the first cluster.
        cluster2: Sequence of row indices of ``X`` forming the second cluster.
        method: ``'single'`` (smallest pairwise distance), ``'complete'``
            (largest pairwise distance) or ``'average'`` (mean pairwise
            distance).

    Returns:
        The Euclidean linkage distance as a float.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    reducers = {'single': np.min, 'complete': np.max, 'average': np.mean}
    if method not in reducers:
        raise ValueError("Invalid method specified.")

    # Euclidean distance between every point of cluster1 and every point of
    # cluster2.  All three criteria reduce these same distances; 'single'
    # and 'complete' previously reduced signed coordinate differences, which
    # are not distances and can be negative.
    diffs = X[list(cluster1)][:, None, :] - X[list(cluster2)][None, :, :]
    pairwise = np.linalg.norm(diffs, axis=2)
    return reducers[method](pairwise)
# Small example data set: four 2-D points.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])

# Hierarchical clustering with single linkage.
distances, cluster_indices = linkage(X, method='single')

# Each caption is followed by its array on the next line.
print("Distances:", distances, sep="\n")
print("Cluster indices:", cluster_indices, sep="\n")
谱聚类
python实现
import numpy as np
from sklearn.datasets import load_iris
from scipy.linalg import eigh
from sklearn.cluster import KMeans
def kernel_matrix(X, sigma=1):
    """Compute the RBF (Gaussian) kernel matrix of the rows of ``X``.

    ``K[i, j] = exp(-||X[i] - X[j]||**2 / (2 * sigma**2))``

    Args:
        X: Data matrix of shape (n_samples, n_features).
        sigma: Bandwidth of the RBF kernel, 1 by default.

    Returns:
        K: Symmetric kernel matrix of shape (n_samples, n_samples) with
        ones on the diagonal (the distance of a sample to itself is zero).
    """
    X = np.asarray(X, dtype=float)
    # Squared Euclidean distance between every pair of rows via
    # broadcasting; this replaces the Python double loop, which also wrote
    # to an undefined name ``k`` and raised NameError.
    diffs = X[:, None, :] - X[None, :, :]
    squared_distances = np.sum(diffs ** 2, axis=2)
    K = np.exp(-squared_distances / (2 * sigma ** 2))
    return K
def spectral_clustering(K, n_clusters):
    """Spectral clustering on a precomputed kernel (affinity) matrix.

    Args:
        K: Kernel matrix of shape (n_samples, n_samples).
        n_clusters: Number of clusters.

    Returns:
        clusters: Cluster index of every sample, shape (n_samples,).
    """
    # Degree matrix: row sums of K on the diagonal.
    D = np.diag(np.sum(K, axis=1))
    # Unnormalised graph Laplacian.
    L = D - K
    # Eigen-decomposition of the symmetric Laplacian.
    eigen_vals, eigen_vecs = eigh(L)
    # Keep the eigenvectors belonging to the n_clusters smallest eigenvalues.
    idx = np.argsort(eigen_vals)[:n_clusters]
    eigenvectors = eigen_vecs[:, idx]
    # Cluster the rows of the spectral embedding with k-means.  The
    # estimator was previously referenced as ``Kmaens`` (NameError).
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(eigenvectors)
    return clusters
def main():
    """Run the hand-written spectral clustering on standardised Iris data.

    Prints the true class and the assigned cluster of the first ten samples.
    """
    # Imported locally so this section does not rely on an import made in
    # an earlier part of the file.
    from sklearn.preprocessing import StandardScaler

    iris = load_iris()
    X = iris.data
    y = iris.target

    # Standardise the features.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # RBF kernel matrix of the standardised samples.
    K = kernel_matrix(X_scaled, sigma=1)

    # Spectral clustering.  The result was previously bound to ``cluster``
    # while the loop below read ``clusters`` (NameError).
    clusters = spectral_clustering(K, n_clusters=3)

    # Print the clustering result of the first ten samples.
    for i in range(10):
        print("样本 {}: 真实类别{},聚类结果{}".format(i, y[i], clusters[i]))

    # Optional visualisation on the first two features (left disabled):
    # import matplotlib.pyplot as plt
    # plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)
    # plt.xlabel('Feature 1')
    # plt.ylabel('Feature 2')
    # plt.title('Spectral Clustering - Iris Dataset')
    # plt.show()


if __name__ == "__main__":
    main()
sklearn实现
1
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from scipy.sparse import csgraph
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
# Load the Iris data set.
iris = load_iris()
X, y = iris.data, iris.target

# Standardise the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Similarity (RBF kernel) matrix between all pairs of samples.
similarity = rbf_kernel(X_scaled)

# Degree matrix (row sums on the diagonal) and unnormalised Laplacian.
degree = np.diag(similarity.sum(axis=1))
laplacian = degree - similarity

# Eigenvalues and eigenvectors of the symmetric Laplacian.
eigenvalues, eigenvectors = np.linalg.eigh(laplacian)

# Keep the eigenvectors that belong to the k smallest eigenvalues.
k = 3
indices = np.argsort(eigenvalues)[:k]
eigenvectors_subset = eigenvectors[:, indices]

# Cluster the rows of the spectral embedding with KMeans.
kmeans = KMeans(n_clusters=k, random_state=0)
clusters = kmeans.fit_predict(eigenvectors_subset)

# Print the true class and the assigned cluster of the first ten samples.
for i in range(10):
    print("样本 {}: 真实类别{},聚类结果{}".format(i, y[i], clusters[i]))

# Optional visualisation on the first two features (left disabled):
# plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters)
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.title('Spectral Clustering - Iris Dataset')
# plt.show()
2
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt
# Load the Iris data set.
iris = load_iris()
X, y = iris.data, iris.target

# Standardise the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Configure scikit-learn's spectral clustering for three clusters.
n_clusters = 3
spectral_clustering = SpectralClustering(
    random_state=42,
    n_clusters=n_clusters,
)

# Fit the model and obtain one cluster index per sample.
clusters = spectral_clustering.fit_predict(X_scaled)

# (Printing every sample's class and cluster was left disabled here.)

# Samples on the first two standardised features, coloured by cluster.
plt.scatter(*X_scaled[:, :2].T, c=clusters)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Spectral Clustering - Iris Dataset')
plt.show()
代表性的聚类算法性能比较
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
# Build five toy data sets: isotropic blobs, blobs with unequal spread,
# linearly transformed (anisotropic) blobs, concentric circles, two moons.
data1 = datasets.make_blobs(n_samples=500, random_state=30)
data2 = datasets.make_blobs(n_samples=500, cluster_std=[1.0, 2.5, 0.5], random_state=30)
X3, y3 = datasets.make_blobs(n_samples=500, random_state=30)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X3 = np.dot(X3, transformation)
data3 = (X3, y3)
data4 = datasets.make_circles(n_samples=500, factor=0.5, noise=0.05, random_state=30)
data5 = datasets.make_moons(n_samples=500, noise=0.05, random_state=30)
# Kept under its own name: binding this list to ``datasets`` would replace
# the imported ``sklearn.datasets`` module and break a second run.
toy_datasets = [data1, data2, data3, data4, data5]

# Five colours plus black as the last entry.  Labels are mapped with a
# modulo below, so DBSCAN's noise label -1 lands on black and an algorithm
# that finds more clusters than colours cannot raise IndexError.
colors = np.array(["#ff7f00", "#984ea3", "#377eb8", "#4daf4a", "#999999", "#000000"])

plt.figure(figsize=(20, 12))
plt.subplots_adjust(wspace=0.02, hspace=0.02)
plot_num = 1  # running subplot index: one row per data set, one column per algorithm

for i_dataset, dataset in enumerate(toy_datasets):
    X, y = dataset
    nC = len(np.unique(y))  # number of ground-truth classes
    X = StandardScaler().fit_transform(X)

    # Create the six clustering models for this data set.
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.2)
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    # Named ``kmeans_model`` so the ``KMeans`` class imported elsewhere in
    # this file is not overwritten by an instance.
    kmeans_model = cluster.KMeans(n_clusters=nC, random_state=20, n_init="auto")
    # The keyword is ``n_clusters``; ``n_cluster`` raised TypeError.
    spectral = cluster.SpectralClustering(
        n_clusters=nC,
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        random_state=20,
    )
    dbscan = cluster.DBSCAN(eps=0.3)
    birch = cluster.Birch(n_clusters=nC)
    gmm = mixture.GaussianMixture(
        n_components=nC,
        covariance_type="full",
        random_state=20,
    )
    clustering_algorithms = (
        ("KMeans", kmeans_model),
        ("BIRCH", birch),
        ("DBSCAN", dbscan),
        ("MeanShift", ms),
        ("Gaussian Mixture", gmm),
        ("Spectral Clustering", spectral),
    )

    for name, algorithm in clustering_algorithms:
        # Silence library warnings raised during fitting.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            algorithm.fit(X)

        # Use ``labels_`` when the fitted estimator has it; otherwise
        # obtain the assignment through ``predict``.
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(toy_datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            # Column captions only on the first row.
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred % len(colors)])
        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plot_num += 1

plt.show()