Python KMeans unsupervised clustering: automatic selection of the cluster count (autoKMeans), a graph-connectivity / minimum-spanning-tree style farthest-first initialization, social-network style data, clustering of one-hot encoded features
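The class below clusters binary (one-hot encoded) samples. Instead of the usual Euclidean distance it uses an overlap-based dissimilarity, d(x, y) = 1 - |x AND y| / max(|x|, |y|), where |x| counts the ones in x, and it seeds centers with a farthest-first sweep that stops once the farthest remaining sample is already close enough to an existing center; in "autoKMeans" mode that stopping point also fixes the number of clusters. A minimal illustration of the dissimilarity (toy vectors made up for this example, not taken from the real data):

import numpy as np

x = np.array([1, 1, 0, 1, 0])  # 3 ones
y = np.array([1, 0, 0, 1, 1])  # 3 ones, 2 of them shared with x
common = np.count_nonzero((x == 1) & (y == 1))  # 2 shared ones
d = 1 - common / max(np.count_nonzero(x == 1), np.count_nonzero(y == 1))  # 1 - 2/3 ≈ 0.33
print(d)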
import os
from collections import Counter
import numpy as np
import pandas as pd
class autoKMeans:
"""K均值聚类算法实现类"""
    def __init__(self, n_clusters=3, max_iter=300, Similarity=0.3, kmeansType="autoKMeans"):
        """
        Initialize the KMeans parameters
        :param n_clusters: number of cluster centers, default 3 (recomputed in "autoKMeans" mode)
        :param max_iter: maximum number of iterations, default 300
        :param Similarity: minimum similarity for splitting off a new cluster, default 0.3; larger values yield more, finer-grained clusters
        :param kmeansType: "autoKMeans" (pick the number of clusters automatically) or "accKMeans" (use n_clusters as given)
        """
self.n_clusters = n_clusters
self.max_iter = max_iter
        self.centroids = None  # cluster center vectors
        self.labels = None  # cluster label of each sample
        self.kmeansType = kmeansType  # "autoKMeans" / "accKMeans"
        self.Similarity = 1 - Similarity  # stop threshold derived from Similarity (default 0.3): a larger Similarity gives more, finer-grained clusters, and vice versa
    def _count_common_ones(self, vec1, vec2):
        """Number of positions where both binary vectors are 1."""
        return np.count_nonzero((vec1 == 1) & (vec2 == 1))
    def _euclidean_distance(self, x, y):
        """
        Overlap-based dissimilarity between two binary vectors:
        1 - (number of shared ones) / max(number of ones in x, number of ones in y)
        :param x: binary vector x
        :param y: binary vector y
        :return: dissimilarity in [0, 1]; 0 means identical sets of ones, 1 means no overlap
        """
        # return np.sqrt(np.sum((x - y) ** 2))  # the plain Euclidean distance this method replaces (unused)
        n1 = np.count_nonzero(x == 1)
        n2 = np.count_nonzero(y == 1)
        n = n1 if n1 > n2 else n2
        r = self._count_common_ones(x, y)
        # return 99 if r / n == 0 else 1 - r / n
        return 1 - (r / n)
    def _initialize_centroids(self, X: np.ndarray):
        """
        Initialize cluster centers with a farthest-first sweep:
        start from the sample with the most ones, then repeatedly add the sample
        farthest (by the overlap dissimilarity) from all centers chosen so far.
        In "autoKMeans" mode the sweep stops, and fixes n_clusters, once the
        farthest remaining sample is already within the similarity threshold.
        :param X: input data matrix
        :return: list of initial center vectors
        """
        # indices = random.sample(range(len(X)), self.n_clusters)  # plain random init (unused)
        contens = []
        n, m = X.shape
        # first center: the sample with the most ones
        minsum = 0.0
        ind = 0
for i in range(n):
if np.sum(X[i,:])>minsum:
minsum = np.sum(X[i,:])
ind = i
contens.append(X[ind,:])
        for k in range(1, self.n_clusters):
            # for every sample, distance to its nearest already-chosen center
            minDist = []
            for i in range(n):
                md = np.inf
                for j in range(len(contens)):
                    dist = self._euclidean_distance(X[i, :], contens[j])
                    if dist < md:
                        md = dist
                minDist.append(md)
            # next center: the sample farthest from all current centers
            max_index = np.argmax(np.array(minDist))
            contens.append(X[max_index, :])
            # autoKMeans: stop adding centers once even the farthest sample is
            # already within the similarity threshold of an existing center
            if minDist[max_index] < self.Similarity and minDist[max_index] != np.inf and self.kmeansType == "autoKMeans":
                self.n_clusters = len(contens)
                break
        return contens
    def _assign_clusters(self, X):
        """
        Assign each sample to its nearest center
        :param X: input data matrix
        :return: array of cluster labels, one per sample
        """
        # distance matrix: samples x centers
distances = np.zeros((len(X), self.n_clusters))
for i in range(self.n_clusters):
distances[:, i] = np.array(
[self._euclidean_distance(x, self.centroids[i]) for x in X]
)
        # index of the nearest center for each sample
return np.argmin(distances, axis=1)
    def _update_centroids(self, X, labels):
        """
        Recompute each center from the samples currently assigned to it
        :param X: input data matrix
        :param labels: current cluster labels
        :return: updated array of centers
        """
new_centroids = np.zeros((self.n_clusters, X.shape[1]))
for i in range(self.n_clusters):
cluster_points = X[labels == i]
            # handle empty clusters
            if len(cluster_points) > 0:
                # new_centroids[i] = np.mean(cluster_points, axis=0)  # the plain mean is not suitable for binary one-hot data
                # take the element-wise OR of the cluster members instead
                ctids = np.sum(cluster_points, axis=0)
                ctids[ctids > 0] = 1
                new_centroids[i] = ctids
            else:
                new_centroids[i] = self.centroids[i]  # keep the previous center
return new_centroids
    def fit(self, X):
        """
        Fit the KMeans model
        :param X: input data matrix (n_samples, n_features)
        """
        # 1. initialize the centers (in autoKMeans mode n_clusters starts at the
        #    sample count and is reduced by the farthest-first sweep in _initialize_centroids)
if self.kmeansType == "autoKMeans":
self.n_clusters = X.shape[0]
self.centroids = self._initialize_centroids(X)
        # 2. iterative refinement
        for _ in range(self.max_iter):
            old_centroids = self.centroids.copy()
            # a. assign each sample to its nearest center
            self.labels = self._assign_clusters(X)
            # b. recompute the centers
            self.centroids = self._update_centroids(X, self.labels)
            # c. check for convergence
if np.allclose(old_centroids, self.centroids):
break
    def predict(self, X):
        """
        Predict the cluster of new samples
        :param X: new data matrix
        :return: array of predicted cluster labels
        """
return self._assign_clusters(X)
# def visualize_clusters(X, labels, centroids):
# """可视化聚类结果(需安装matplotlib)"""
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 6))
# colors = ['r', 'g', 'b', 'y', 'c', 'm']
# for i in range(len(centroids)):
# cluster_points = X[labels == i]
# plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
# c=colors[i], label=f'Cluster {i + 1}')
# plt.scatter(centroids[i][0], centroids[i][1],
# c='k', marker='x', s=100)
# plt.legend()
# plt.title('KMeans Clustering Result')
# plt.show()
if __name__ == "__main__":
    # test data generation (synthetic alternative, commented out)
# np.random.seed(42)
# cluster1 = np.random.normal(loc=[0, 0], scale=1, size=(100, 2))
# cluster2 = np.random.normal(loc=[5, 5], scale=1, size=(100, 2))
# cluster3 = np.random.normal(loc=[-5, 5], scale=1, size=(100, 2))
# X:np.ndarray = np.vstack([cluster1, cluster2, cluster3])
data = pd.read_csv('./data/T3H_ALTERT_CUST_2025.CSV',header=0,index_col=0)
print(data.shape)
# (131867, 143)
X:np.ndarray = data.to_numpy()[:1000,:]
    # model training
kmeans = autoKMeans(n_clusters=3,max_iter=1000,kmeansType="autoKMeans")
kmeans.fit(X)
r = kmeans.predict(X)
cnt = dict(Counter(list(r)).most_common())
    rlt = pd.DataFrame(r, columns=["label"])
print(cnt)
print(rlt)
path = './data/result.csv'
if os.path.exists(path):
os.remove(path)
rlt.to_csv(path)
path_x = './data/x.csv'
if os.path.exists(path_x):
os.remove(path_x)
pd.DataFrame(X,columns=data.columns).to_csv(path_x)
    # {np.int64(2): 807, np.int64(0): 135, np.int64(5): 35, np.int64(1): 12, np.int64(4): 6, np.int64(3): 5}
    # visualize the result
# visualize_clusters(X, kmeans.labels, kmeans.centroids)
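A minimal usage sketch. The CSV above (./data/T3H_ALTERT_CUST_2025.CSV) is not included here, so the data below is synthetic one-hot style data made up purely for illustration (the make_block helper and X_demo are assumptions, not part of the original script); run it in the same file as the class above.

import numpy as np
from collections import Counter

rng = np.random.default_rng(0)

def make_block(n_rows, cols, n_features=30, p=0.9):
    """Rows that mostly light up the given group of feature columns."""
    B = np.zeros((n_rows, n_features), dtype=int)
    cols = list(cols)
    B[:, cols[0]] = 1  # guarantee at least one active feature per row
    for c in cols[1:]:
        B[:, c] = (rng.random(n_rows) < p).astype(int)
    return B

X_demo = np.vstack([
    make_block(40, range(0, 10)),   # group 1
    make_block(40, range(10, 20)),  # group 2
    make_block(40, range(20, 30)),  # group 3
])

km = autoKMeans(max_iter=100, Similarity=0.3, kmeansType="autoKMeans")
km.fit(X_demo)                      # autoKMeans mode picks the number of clusters itself
labels = km.predict(X_demo)
print("clusters found:", km.n_clusters)
print(Counter(labels.tolist()))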
