The K-means Algorithm
K-means is an unsupervised algorithm. First decide the number of clusters k and pick k points at random as cluster centers. Each sample point is assigned to the cluster whose center is nearest to it, then every center is moved to the mean of its cluster, and these two steps repeat until the loss (for example the squared loss, i.e. the squared distance from each sample point to its cluster center) barely changes. The code below implements this idea.
import numpy as np
import pandas as pd
import random
from dfs import Graph  # local module used for the post-processing step below

# Load the weight column and the two coordinate columns (columns 2, 4 and 5).
X = pd.read_excel(io=r'./data.xlsx', sheet_name=2, engine='openpyxl')
X = X.iloc[:, [2, 4, 5]]
X = X.values


class K_Means(object):
    def __init__(self, k=2, max_iter=300):
        self.k_ = k
        self.max_iter_ = max_iter

    def fit(self, data):
        self.centers_ = {}
        # Shuffle the row indices; the first k shuffled rows become the initial centers.
        arr = sorted(range(data.shape[0]), key=lambda x: random.random())
        print(arr)
        J = np.zeros(self.max_iter_)    # total weighted loss per iteration
        Max = np.zeros(self.max_iter_)  # largest point-to-center distance per iteration
        for i in range(self.k_):
            self.centers_[i] = data[arr[i], 1:]
        for i in range(self.max_iter_):
            self.clf_ = {}
            for j in range(self.k_):
                self.clf_[j] = []
            for feature in data:
                # feature[0] is a weight, feature[1:] are the coordinates.
                distances = [np.linalg.norm(feature[1:] - self.centers_[center]) * feature[0]
                             for center in self.centers_]
                classification = distances.index(min(distances))
                J[i] += min(distances)
                Max[i] = max(Max[i], min(np.linalg.norm(feature[1:] - self.centers_[center])
                                         for center in self.centers_))
                self.clf_[classification].append(feature)
            # Move every non-empty cluster's center to the weighted mean of its points.
            for c in self.clf_:
                tmp = np.array(self.clf_[c])
                if tmp.size > 0:
                    self.centers_[c] = np.sum(tmp[:, 0:1] / np.sum(tmp[:, 0]) * tmp[:, 1:], axis=0)
            print(f"iteration {i}: loss {J[i]}, max distance {Max[i]}, centers {self.centers_}")

    def getCenters(self):
        return self.centers_

    def getclf(self):
        return self.clf_


F = K_Means(450, 10)
F.fit(X)
ans = F.getCenters()
clf = F.getclf()

print(ans, '\n', F.getclf())
with open("ans1.txt", "w") as f:
    for k, v in ans.items():
        f.write(f"{k} {v[0]} {v[1]}\n")

print(ans[0])
print(clf[0][1:], len(clf[0][1:]))
with open("anslu.txt", "w") as f2:
    for i in range(450):
        G = Graph(ans[i], clf[i][1:])
        f2.write(f"{G.dfs()[1]}\n")
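As a cross-check, the same clustering can be reproduced with scikit-learn's KMeans, which supports per-sample weights through the sample_weight argument of fit. This is only a minimal sketch, assuming the same data.xlsx layout as above and that the first selected column is a per-sample weight; note that scikit-learn minimizes the weighted sum of squared distances, whereas the loop above accumulates weighted unsquared distances, so the reported losses are not directly comparable.

import pandas as pd
from sklearn.cluster import KMeans

data = pd.read_excel(io=r'./data.xlsx', sheet_name=2, engine='openpyxl')
data = data.iloc[:, [2, 4, 5]].values

weights = data[:, 0]   # first selected column treated as a per-sample weight
points = data[:, 1:]   # remaining columns are the coordinates

# n_init restarts with different random seeds and keeps the lowest-inertia run.
km = KMeans(n_clusters=450, n_init=10, max_iter=10)
km.fit(points, sample_weight=weights)

print(km.cluster_centers_[:5])   # first few centers
print(km.inertia_)               # weighted sum of squared distances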
A few issues came up during the implementation. If the centers are initialized at arbitrary random positions, a center may end up far from every sample point and never get updated. If the k initial centers are instead drawn from the sample points, in extreme cases a center can get stuck on a single sample and stop moving. And when there are many centers, the loss surface has many local minima, so the optimal solution is hard to find. These problems can be mitigated by adjusting k so that the result tends toward a better clustering.
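Another common remedy, beyond adjusting k, is a smarter seeding in the style of k-means++: pick the first center at random, then pick each subsequent center with probability proportional to its squared distance from the nearest center chosen so far, which spreads the initial centers out. The sketch below is illustrative and not part of the code above; the function name kmeans_pp_init and its interface are assumptions.

import numpy as np

def kmeans_pp_init(points, k, rng=None):
    """k-means++ style seeding: spread the initial centers out.

    points : (n, d) array of coordinates
    k      : number of centers
    """
    rng = np.random.default_rng(rng)
    centers = [points[rng.integers(len(points))]]  # first center: uniform at random
    for _ in range(k - 1):
        # squared distance from every point to its nearest already-chosen center
        d2 = np.min([np.sum((points - c) ** 2, axis=1) for c in centers], axis=0)
        probs = d2 / d2.sum()
        centers.append(points[rng.choice(len(points), p=probs)])
    return np.array(centers)

Plugging this into K_Means.fit would mean replacing the random shuffle with something like kmeans_pp_init(data[:, 1:], self.k_). Running fit several times with different seeds and keeping the run with the lowest loss J is another cheap way to avoid bad local minima.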