K-means算法

K-means算法是一种无监督算法。首先需要确定要划分的聚类数k,并随机选取k个点作为初始聚类点(聚类中心);然后把每个样本点分配给离它最近的聚类点所在的聚类,再把每个聚类内样本的均值(mean)设为该聚类新的聚类点;如此反复迭代,直到损失(例如平方损失,即每个样本点到其所属聚类点的距离平方之和)变化不大为止。按照此思路实现的代码如下(注意:下面的实现是带权重的变体,数据的第一列被当作样本权重,距离会乘以权重,聚类点取加权平均)。

import numpy as np
import pandas as pd
import random
import torch
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from dfs import Graph

# Load the worksheet at index 2 of ./data.xlsx and keep only the columns at
# positions 2, 4 and 5 as a plain NumPy array. K_Means.fit below treats the
# first of these columns as a per-sample weight and the rest as coordinates.
X = (
    pd.read_excel(io=r'./data.xlsx', sheet_name=2, engine='openpyxl')
    .iloc[:, [2, 4, 5]]
    .values
)

# Module-level placeholders; K_Means.fit uses its own local arrays, so the
# visible code never reads these two lists.
J, Max = [], []



class K_Means(object):
    """Weighted K-means clustering.

    Every input row is laid out as ``[weight, coord_1, ..., coord_d]``.
    A sample is assigned to the center that minimises
    ``weight * euclidean_distance`` and each center is then moved to the
    weighted mean of the coordinates of its members.

    Attributes set by :meth:`fit`:
        centers_:  dict mapping cluster index -> coordinate array.
        clf_:      dict mapping cluster index -> list of member rows
                   (full rows, weight column included) from the last pass.
        loss_:     array of length ``max_iter``; summed weighted distance of
                   every sample to its chosen center, per iteration.
        max_dist_: array of length ``max_iter``; largest unweighted distance
                   from a sample to its nearest center, per iteration.
    """

    def __init__(self, k=2, max_iter=300):
        """Store the number of clusters ``k`` and the iteration count."""
        self.k_ = k
        self.max_iter_ = max_iter

    def fit(self, data):
        """Run ``max_iter`` assignment/update passes over ``data``.

        Args:
            data: 2-D NumPy array whose column 0 is the sample weight and
                whose remaining columns are the coordinates.

        Raises:
            IndexError: if ``k`` is larger than the number of rows, because
                the initial centers are drawn from distinct rows.
        """
        # Shuffle the row indices and use the first k rows as initial centers.
        order = list(range(data.shape[0]))
        random.shuffle(order)
        self.centers_ = {c: data[order[c], 1:] for c in range(self.k_)}

        # Defined up front so getclf() is usable even when max_iter is 0.
        self.clf_ = {c: [] for c in range(self.k_)}

        # One slot per ITERATION (sizing these by sample count overflowed
        # whenever max_iter exceeded the number of rows).
        self.loss_ = np.zeros(self.max_iter_)
        self.max_dist_ = np.zeros(self.max_iter_)

        for it in range(self.max_iter_):
            self.clf_ = {c: [] for c in range(self.k_)}

            # Assignment step.
            for feature in data:
                raw = [np.linalg.norm(feature[1:] - self.centers_[c])
                       for c in self.centers_]
                weighted = [d * feature[0] for d in raw]
                best = min(weighted)
                self.loss_[it] += best
                self.max_dist_[it] = max(self.max_dist_[it], min(raw))
                # list.index returns the first minimum, so ties go to the
                # lowest cluster index.
                self.clf_[weighted.index(best)].append(feature)

            # Update step: weighted mean of each non-empty cluster.
            for c, members in self.clf_.items():
                tmp = np.array(members)
                if tmp.size == 0:
                    continue  # empty cluster keeps its previous center
                total_weight = np.sum(tmp[:, 0])
                if total_weight == 0:
                    continue  # avoid 0/0 -> NaN center; keep previous center
                self.centers_[c] = np.sum(
                    tmp[:, 0:1] / total_weight * tmp[:, 1:], axis=0)

            # Report through self rather than a module-level instance.
            print(f"第{it}次损失{self.loss_[it]},最大距离{self.max_dist_[it]},结果{self.getCenters()}")

    def getCenters(self):
        """Return the dict of cluster index -> center coordinates."""
        return self.centers_

    def getclf(self):
        """Return the dict of cluster index -> member rows of the last pass."""
        return self.clf_

# Cluster X into N_CLUSTERS groups (10 passes), then dump the results.
N_CLUSTERS = 450  # single source of truth for the cluster count used below

F = K_Means(N_CLUSTERS, 10)
F.fit(X)
ans = F.getCenters()
clf = F.getclf()
print(ans,'\n',F.getclf())

# ans1.txt: one line per cluster -> "<index> <center[0]> <center[1]>".
# The with-block closes the file even if a write fails.
with open("ans1.txt", "w", encoding="utf-8") as f:
    for k, v in ans.items():
        f.write(f"{k} {v[0]} {v[1]}\n")

print(ans[0])
print(clf[0][1:],len(clf[0][1:]), clf[0][1:])

# anslu.txt: one line per cluster holding element [1] of Graph.dfs().
# NOTE(review): Graph comes from the project-local `dfs` module; presumably
# dfs() returns a route through the cluster members - confirm there.
# NOTE(review): clf[i][1:] drops the first member of every cluster - confirm
# that this is intentional.
with open("anslu.txt", "w", encoding="utf-8") as f2:
    for i in range(N_CLUSTERS):
        G = Graph(ans[i], clf[i][1:])
        f2.write(f"{G.dfs()[1]}\n")


实现的过程中想到了一些问题。如果在空间中随机初始化聚类点,可能存在某个聚类点离所有样本点都特别远,它分不到任何样本,因而永远不会被更新。如果改为随机选取k个样本点作为初始聚类点,极端情况下会出现某个聚类点只包含它自己这一个样本,于是一直附着在该样本点上不再移动。另外,如果聚类点比较多,损失函数会有多个局部极值点,不容易找到全局最优解。这些问题可以通过调整K的取值来缓解,使结果趋向于较好的分类。

posted @ 2022-10-14 00:14  孑然520  阅读(89)  评论(0)    收藏  举报