import pandas as pd
import numpy as np
# Load the data set and keep its first four columns as the features to cluster.
data = pd.read_csv(r'data.csv')
# Take an explicit copy: kmeans() later assigns a 'near' column to this frame,
# and writing into a slice of `data` is a chained assignment that pandas
# warns about (and that may or may not propagate, depending on the version).
train = data.iloc[:, 0:4].copy()
# Samples are compared with centroids by (squared) Euclidean distance.
# If the features are on different scales, normalize the data beforehand.
def nearest(traini, center):
    """Return the position of the centroid closest to a single sample.

    Args:
        traini: pandas Series holding one sample, indexed by feature name.
        center: pandas DataFrame with one centroid per row whose columns are
            the same feature names.

    Returns:
        int: 0-based row position in ``center`` of the centroid with the
        smallest squared Euclidean distance (the first one wins on ties).

    Raises:
        ValueError: if ``center`` has no rows.
    """
    if len(center) == 0:
        raise ValueError("center must contain at least one centroid")
    # sub(..., axis=1) aligns the sample's labels with the centroid columns,
    # just like the Series - Series subtraction it replaces, but handles all
    # centroids at once and never looks rows up by label (the removed ``.ix``
    # did, which broke on centroid frames whose index was not 0..k-1).
    diffs = center.sub(traini, axis=1).to_numpy(dtype=float)
    # The square root is skipped: it does not change which row is the minimum.
    squared = (diffs * diffs).sum(axis=1)
    return int(squared.argmin())
def zhidian(x):
    """Return the sum of ``x`` divided by its length (used as the centroid).

    Args:
        x: object providing ``sum()`` and ``len()``.

    Returns:
        ``x.sum() / len(x)``.
    """
    total = x.sum()
    count = len(x)
    return total / count
# Convergence criterion
def shoulian(train, center):
    """Return the total squared distance from each sample to its centroid.

    Args:
        train: pandas DataFrame whose first four columns are the features and
            which has a ``'near'`` column naming each row's centroid.
        center: pandas DataFrame of centroids; its index holds the labels that
            appear in ``train['near']`` and its first four columns are the
            features.

    Returns:
        float: sum over all rows of the squared Euclidean distance between the
        row's features and the centroid it is assigned to (0.0 for no rows).

    Raises:
        KeyError: if ``train`` has no ``'near'`` column or a label in it is
            missing from ``center``'s index.
    """
    if len(train) == 0:
        return 0.0
    labels = train['near'].to_numpy()
    # Rows are taken by position and centroids by label; the removed ``.ix``
    # accessor mixed both meanings depending on the index type.
    samples = train.iloc[:, 0:4].reset_index(drop=True)
    assigned = center.iloc[:, 0:4].loc[labels].reset_index(drop=True)
    # DataFrame subtraction still aligns the columns by feature name.
    diffs = (samples - assigned).to_numpy(dtype=float)
    return float((diffs * diffs).sum())
def kmeans(train, center, julihe):
    """Iterate k-means until the total squared distance stops decreasing.

    Each round assigns every sample to its nearest centroid, recomputes the
    centroids with ``zhidian`` and measures the new cost with ``shoulian``.
    A round is accepted only when its cost is strictly lower than the best
    cost so far; otherwise the centroids from before that round are returned.

    Args:
        train: pandas DataFrame of samples. It is modified in place: a
            ``'near'`` column with each row's centroid position is written.
        center: pandas DataFrame of starting centroids, one per row, with the
            same feature columns as ``train``.
        julihe: cost threshold the first round has to beat.

    Returns:
        tuple: ``(train, center)`` where ``train['near']`` holds row positions
        into the returned ``center``.
    """
    # Features are every column except a 'near' column left by a previous run.
    feature_cols = [col for col in train.columns if col != 'near']
    features = train[feature_cols]
    n_samples = len(features)
    # A loop replaces the original self-recursion, so the number of rounds is
    # not limited by the interpreter's recursion depth.
    while True:
        print('return')
        # Nearest centroid (by position) for every sample.
        near = np.fromiter(
            (nearest(features.iloc[i, :], center) for i in range(n_samples)),
            dtype=int,
            count=n_samples,
        )
        # Recompute the centroids. Grouping by the label array (instead of a
        # 'near' column) keeps the labels out of the centroid frame no matter
        # how the installed pandas treats grouping columns in apply().
        new_center = features.groupby(near).apply(zhidian)
        train['near'] = near
        # Convergence criterion
        newjulihe = shoulian(train, new_center)
        if not newjulihe < julihe:
            print(center)
            return train, center
        # Re-index so that centroid labels equal row positions even when a
        # cluster lost all of its samples and dropped out of the grouping.
        center = new_center.reset_index(drop=True)
        julihe = newjulihe
def sdasd(train, julihe, k=3):
    """Seed k-means with the first ``k`` samples and run it.

    The removed ``train.ix[0:3, :]`` sliced by label with an inclusive end, so
    on a default integer index it picked four seed rows although three
    centroids were intended; ``iloc`` takes exactly ``k`` rows by position.

    Args:
        train: pandas DataFrame of samples (feature columns only).
        julihe: initial cost threshold forwarded to ``kmeans``.
        k: number of centroids to seed; defaults to 3.

    Returns:
        tuple: the ``(train, center)`` pair returned by ``kmeans``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    # reset_index yields a new frame whose labels equal the row positions.
    center = train.iloc[0:k, :].reset_index(drop=True)
    train, center = kmeans(train, center, julihe)
    return train, center
# Initial cost threshold. kmeans() only accepts a round whose cost is strictly
# below this value, so a fixed number such as 100000 would stop the algorithm
# on the very first round for data whose cost is larger; infinity never does.
julihe = float('inf')
train, center = sdasd(train, julihe)