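# Python KMeans unsupervised clustering: automatic (AUTO) choice of the cluster count, a graph-theory view of KMeans (connected graph / minimum spanning tree), social networks, and unsupervised clustering of one-hot encoded data.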

import os
from collections import Counter
import numpy as np
import pandas as pd

class autoKMeans:
    """K-means-style clustering for binary (one-hot) feature vectors.

    Instead of the Euclidean metric, samples are compared with an overlap
    distance: ``1 - shared_ones / max(ones_in_a, ones_in_b)``.  Centroids are
    binary vectors formed as the union (logical OR) of their members.

    In ``"autoKMeans"`` mode the number of clusters is discovered during
    centroid seeding; any other ``kmeansType`` value uses ``n_clusters`` as
    given.
    """

    def __init__(self, n_clusters=3, max_iter=300, Similarity=0.3, kmeansType="autoKMeans"):
        """
        Store the clustering hyper-parameters.

        :param n_clusters: number of clusters (overwritten by ``fit`` in
            ``"autoKMeans"`` mode), default 3
        :param max_iter: maximum number of assign/update iterations, default 300
        :param Similarity: minimum similarity used to stop adding centroids in
            ``"autoKMeans"`` mode, default 0.3; larger values yield more,
            finer-grained clusters
        :param kmeansType: ``"autoKMeans"`` to pick the cluster count
            automatically; any other value (e.g. ``"accKMeans"``) keeps
            ``n_clusters`` fixed
        """
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.centroids = None  # cluster centers, set by fit()
        self.labels = None  # cluster index of every training sample, set by fit()
        self.kmeansType = kmeansType
        # Stored as a *distance* threshold: similarity s <=> distance 1 - s.
        self.Similarity = 1 - Similarity

    def _count_common_ones(self, vec1, vec2):
        """Return how many positions hold the value 1 in both vectors."""
        return np.count_nonzero((vec1 == 1) & (vec2 == 1))

    def _distances_to_centroid(self, X, centroid):
        """
        Overlap distance from every row of ``X`` to a single centroid.

        :param X: 2-D data matrix (n_samples, n_features)
        :param centroid: 1-D vector with n_features entries
        :return: float array of length n_samples
        """
        row_ones = np.asarray(X) == 1
        centroid_ones = np.asarray(centroid) == 1
        shared = (row_ones & centroid_ones).sum(axis=1)
        larger = np.maximum(row_ones.sum(axis=1), centroid_ones.sum())
        # Two vectors without any 1s are identical, so their distance is 0;
        # this also avoids dividing by zero.
        distances = np.zeros(len(row_ones), dtype=float)
        has_ones = larger > 0
        distances[has_ones] = 1 - shared[has_ones] / larger[has_ones]
        return distances

    def _euclidean_distance(self, x, y):
        """
        Overlap distance between two binary vectors.

        Despite the (historical) name this is not the Euclidean distance: it
        is ``1 - r / n`` where ``r`` is the number of positions that are 1 in
        both vectors and ``n`` is the larger of the two vectors' 1-counts.
        Two vectors that contain no 1s at all have distance 0.

        :param x: first vector
        :param y: second vector
        :return: distance as a float
        """
        return float(self._distances_to_centroid(np.atleast_2d(x), y)[0])

    def _initialize_centroids(self, X: np.ndarray):
        """
        Seed the centroids with a farthest-point traversal.

        The first centroid is the first row with the largest (positive) sum;
        each further centroid is the sample whose distance to its nearest
        existing centroid is largest.  In ``"autoKMeans"`` mode seeding stops,
        and ``self.n_clusters`` is set to the number of centroids found, once
        that largest distance drops below the stored distance threshold.

        :param X: input data matrix (n_samples, n_features)
        :return: array of initial centroids, one per row
        """
        X = np.asarray(X)
        row_sums = X.sum(axis=1)
        first = int(np.argmax(row_sums))
        if not row_sums[first] > 0:
            first = 0  # no row has a positive sum: fall back to the first row
        centers = [X[first, :]]

        # Distance of every sample to its nearest centroid so far; updated
        # incrementally instead of being recomputed for every new centroid.
        nearest = self._distances_to_centroid(X, centers[0])
        auto = self.kmeansType == "autoKMeans"

        for _ in range(1, self.n_clusters):
            farthest = int(np.argmax(nearest))
            gap = nearest[farthest]
            centers.append(X[farthest, :])

            # NOTE(review): the candidate is appended before the threshold is
            # checked, so the sample that triggers the stop is still kept as a
            # centroid.  This ordering is preserved from the original code;
            # confirm that it is intended.
            if auto and gap < self.Similarity:
                self.n_clusters = len(centers)
                break

            nearest = np.minimum(nearest, self._distances_to_centroid(X, centers[-1]))

        return np.asarray(centers)

    def _assign_clusters(self, X):
        """
        Assign every sample to its nearest centroid.

        :param X: input data matrix (n_samples, n_features)
        :return: array with the cluster index of each sample
        """
        X = np.asarray(X)
        distances = np.empty((len(X), self.n_clusters))
        for i in range(self.n_clusters):
            distances[:, i] = self._distances_to_centroid(X, self.centroids[i])
        # Ties go to the lowest cluster index.
        return np.argmin(distances, axis=1)

    def _update_centroids(self, X, labels):
        """
        Recompute each centroid from the samples currently assigned to it.

        A centroid is the union of its members: every feature whose column
        sum over the members is positive is set to 1.  The mean is not used
        deliberately.  An empty cluster keeps its previous centroid.

        :param X: input data matrix (n_samples, n_features)
        :param labels: current cluster index of each sample
        :return: array of updated centroids (n_clusters, n_features)
        """
        X = np.asarray(X)
        new_centroids = np.zeros((self.n_clusters, X.shape[1]))
        for i in range(self.n_clusters):
            members = X[labels == i]
            if len(members) > 0:
                column_sums = np.sum(members, axis=0)
                new_centroids[i] = np.where(column_sums > 0, 1, column_sums)
            else:
                new_centroids[i] = self.centroids[i]
        return new_centroids

    def fit(self, X):
        """
        Cluster the training data.

        In ``"autoKMeans"`` mode ``self.n_clusters`` is first raised to the
        number of samples so that centroid seeding can decide the final count.

        :param X: input data matrix (n_samples, n_features)
        :raises ValueError: if ``X`` is not a non-empty 2-D array
        """
        X = np.asarray(X)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError(
                "X must be a non-empty 2-D array of shape (n_samples, n_features)"
            )

        # 1. Seed the centroids.
        if self.kmeansType == "autoKMeans":
            self.n_clusters = X.shape[0]
        self.centroids = self._initialize_centroids(X)

        # 2. Alternate assignment and update until the centroids stop moving.
        for _ in range(self.max_iter):
            previous = np.array(self.centroids)
            self.labels = self._assign_clusters(X)
            self.centroids = self._update_centroids(X, self.labels)
            if np.allclose(previous, self.centroids):
                break

    def predict(self, X):
        """
        Return the index of the nearest fitted centroid for each sample.

        :param X: data matrix (n_samples, n_features)
        :return: array of predicted cluster indices
        """
        return self._assign_clusters(X)


# def visualize_clusters(X, labels, centroids):
#     """可视化聚类结果(需安装matplotlib)"""
#     import matplotlib.pyplot as plt
#     plt.figure(figsize=(8, 6))
#     colors = ['r', 'g', 'b', 'y', 'c', 'm']
#     for i in range(len(centroids)):
#         cluster_points = X[labels == i]
#         plt.scatter(cluster_points[:, 0], cluster_points[:, 1],
#                     c=colors[i], label=f'Cluster {i + 1}')
#         plt.scatter(centroids[i][0], centroids[i][1],
#                     c='k', marker='x', s=100)
#     plt.legend()
#     plt.title('KMeans Clustering Result')
#     plt.show()


if __name__ == "__main__":
    # Demo run: cluster the first 1000 rows of the customer CSV and dump both
    # the predicted labels and the rows that were clustered.
    frame = pd.read_csv('./data/T3H_ALTERT_CUST_2025.CSV', header=0, index_col=0)
    print(frame.shape)
    features: np.ndarray = frame.to_numpy()[:1000, :]

    # Fit in automatic mode, then label the same rows.
    model = autoKMeans(n_clusters=3, max_iter=1000, kmeansType="autoKMeans")
    model.fit(features)
    assigned = model.predict(features)

    # Cluster sizes, largest first, followed by the per-row labels.
    cluster_sizes = dict(Counter(list(assigned)).most_common())
    label_frame = pd.DataFrame(assigned, columns=["lable"])
    print(cluster_sizes)
    print(label_frame)

    # Replace any previous label file.
    result_file = './data/result.csv'
    if os.path.exists(result_file):
        os.remove(result_file)
    label_frame.to_csv(result_file)

    # Replace any previous copy of the clustered rows.
    features_file = './data/x.csv'
    if os.path.exists(features_file):
        os.remove(features_file)
    pd.DataFrame(features, columns=frame.columns).to_csv(features_file)

    # Cluster sizes recorded by the original author for this run:
    # {np.int64(2): 807, np.int64(0): 135, np.int64(5): 35, np.int64(1): 12, np.int64(4): 6, np.int64(3): 5}
# Posted 2025-08-20 22:54 by ARYOUOK (blog footer: 10 reads, 0 comments).