ML实战:手动实现Kmeans算法
代码实现
Kmeans类
import numpy as np
import random
np.set_printoptions(suppress=True)
class Kmeans:
    """Minimal K-means clustering (assign / re-center iterations) on a 2-D array.

    Attributes:
        x: Training set, one sample per row.
        cluster: Number of clusters.
        uc: For every sample, the index of the cluster it is currently
            assigned to (-1 until the first assignment pass has run).
        u: Cluster centers, one per row, stored as floats.
    """

    def __init__(self, x, cluster=2):
        """Store the data and seed the centers with randomly chosen samples.

        :param x: training set, a 2-D array with one sample per row
        :param cluster: number of clusters
        :raises ValueError: if ``cluster`` exceeds the number of samples
            (raised by ``random.sample``)
        """
        self.x = np.asarray(x)
        self.cluster = cluster
        self.uc = [-1] * len(self.x)
        # Sample from range(len(x)), not range(1, len(x)), so that sample 0
        # is also eligible as an initial center.
        seeds = random.sample(range(len(self.x)), cluster)
        # Float copy: centers become means, even when x holds integers.
        self.u = np.array(self.x[seeds], dtype=float)

    def find_uci(self, i):
        """Assign sample ``i`` to its nearest center and record it in ``uc``."""
        distances = np.linalg.norm(self.x[i] - self.u, axis=1)
        self.uc[i] = int(np.argmin(distances))

    def single_iter(self):
        """Run one iteration: reassign every sample, then update the centers."""
        for i in range(len(self.x)):
            self.find_uci(i)
        self.update_u()

    def update_u(self):
        """Move every center to the mean of the samples assigned to it.

        A cluster with no assigned samples keeps its previous center instead
        of collapsing to the origin.
        """
        labels = np.asarray(self.uc)
        centers = np.array(self.u, dtype=float)
        for j in range(self.cluster):
            members = self.x[labels == j]
            if len(members):
                centers[j] = members.mean(axis=0)
        self.u = centers

    def fit(self, iter_count=500):
        """Iterate until the assignments stop changing or the budget runs out.

        :param iter_count: maximum number of iterations
        :return: array with the cluster index of every sample
        """
        for _ in range(iter_count):
            previous = list(self.uc)
            self.single_iter()
            # Unchanged assignments produce unchanged centers, so every
            # further iteration would be a no-op.
            if list(self.uc) == previous:
                break
        return np.array(self.uc)
主函数
import sys
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from Kmeas_class import Kmeans
import numpy as np
np.set_printoptions(suppress=True)
# One scatter color per cluster; must hold at least `cluster` entries.
color = ['red', 'pink', 'orange', 'gray']
cluster = 4

# Generate the training set: 500 two-feature samples around `cluster` blob
# centers, with a random seed drawn from [0, 30).
X, y = make_blobs(n_samples=500, n_features=2, centers=cluster,
                  random_state=np.random.randint(0, 30))

# Visualize the generated training set, colored by its true blob label.
plt.figure(1)
for i in range(cluster):
    plt.scatter(X[y == i, 0], X[y == i, 1],
                marker='o',
                s=8,
                c=color[i])
plt.title('Real Data')
# Raw string: the Windows path contains backslashes that are not valid escapes.
plt.savefig(r'E:\python\ml\ml by myself\Kmeans\kmeans_real_myslef.png')

# Run the K-means implementation via fit().
kmeans = Kmeans(X, cluster)
y_predict = kmeans.fit()

# Visualize the predicted clusters on a separate figure, so the prediction is
# not drawn on top of the real-data scatter from figure 1.
plt.figure(2)
for i in range(cluster):
    plt.scatter(X[y_predict == i, 0], X[y_predict == i, 1],
                marker='o',
                s=8,
                c=color[i])
plt.title('Predict Result')
plt.savefig(r'E:\python\ml\ml by myself\Kmeans\kmeans_predict_myslef.png')
sys.exit(0)
结果
![]()
![]()