# -*- coding: utf-8 -*-
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
class Canopy:
    """Canopy pre-clustering over a collection of row vectors.

    Two distance thresholds drive the algorithm, with ``t1 > t2``:
    points closer than ``t1`` to a canopy center become members of that
    canopy, and points closer than ``t2`` are additionally removed from
    the pool of candidate centers.
    """

    def __init__(self, dataset):
        """Store the dataset; thresholds start at 0, meaning "not set"."""
        self.dataset = dataset
        self.t1 = 0
        self.t2 = 0

    def setThreshold(self, t1, t2):
        """Set the loose (``t1``) and tight (``t2``) thresholds.

        The values are only stored when ``t1 > t2``; otherwise a message
        is printed and the current thresholds are left untouched.
        """
        if t1 > t2:
            self.t1 = t1
            self.t2 = t2
        else:
            print('t1 needs to be larger than t2!')

    def euclideanDistance(self, vec1, vec2):
        """Return the Euclidean distance between two vectors."""
        return math.sqrt(((vec1 - vec2)**2).sum())

    def getRandIndex(self):
        """Return a random valid row index into ``self.dataset``."""
        return random.randint(0, len(self.dataset) - 1)

    def getDistanceMatrix(self):
        """Return the square matrix of pairwise Euclidean distances."""
        # pdist yields the condensed distance vector; squareform expands
        # it into the full symmetric matrix.
        condensed = pdist(self.dataset, metric='euclidean')
        return squareform(condensed)

    def clustering(self):
        """Run canopy clustering and return the canopies found.

        Returns:
            A list of ``(center, members)`` tuples. ``center`` is the row
            picked at random as the canopy center and ``members`` is a
            list of the remaining rows closer than ``t1`` to it. An empty
            list is returned (after printing a message) when the
            thresholds have not been set.

        ``self.dataset`` is left untouched, so the method can be called
        repeatedly on the same instance.
        """
        if self.t1 == 0:
            print('Please set the threshold.')
            return []

        # Work on a local array: consuming self.dataset would leave the
        # instance empty after the first call.
        remaining = np.asarray(self.dataset)
        canopies = []  # final (center, members) results
        while len(remaining) != 0:
            # Pick a random point P as the next canopy center and take it
            # out of the candidate pool.
            center_index = random.randint(0, len(remaining) - 1)
            center = remaining[center_index]
            remaining = np.delete(remaining, center_index, 0)

            # Distance from P to every point still in the pool.
            distances = np.array(
                [self.euclideanDistance(center, point) for point in remaining],
                dtype=float,
            )
            # Closer than t1: the point belongs to P's canopy.
            members = list(remaining[distances < self.t1])
            # Closer than t2: the point can no longer become a center.
            remaining = remaining[~(distances < self.t2)]
            canopies.append((center, members))
        return canopies
# 3-D plot of the canopies
def showCanopy(canopies, dataset, t1, t2):
    """Draw every canopy in a 3-D scatter plot and show the figure.

    Args:
        canopies: iterable of ``(center, members)`` pairs; each point
            must have at least three coordinates, of which the first
            three are plotted.
        dataset, t1, t2: accepted for interface compatibility; they are
            not used by the plot.
    """
    fig = plt.figure()
    # Axes3D(fig) no longer attaches the axes to the figure on recent
    # matplotlib releases (resulting in an empty plot); add_subplot with
    # the 3d projection is the supported way to create the axes.
    ax = fig.add_subplot(projection='3d')
    markers = ['.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', 's',
               'p', '*', 'h', 'H', '+', 'x', 'D', 'd', '|', '_']
    for canopy in canopies:
        center = canopy[0]
        components = canopy[1]
        color_i = np.random.rand(3,)  # random colour per canopy
        marker_i = markers[np.random.randint(0, len(markers))]  # random marker
        # Canopy center
        ax.scatter(center[0], center[1], center[2],
                   marker=marker_i, color=color_i, s=5)
        # Canopy members, drawn in one call; depthshade is disabled so
        # the points keep the flat colour they had when drawn one by one.
        if len(components) > 0:
            points = np.asarray(components)
            ax.scatter(points[:, 0], points[:, 1], points[:, 2],
                       marker=marker_i, color=color_i, s=1.5,
                       depthshade=False)
    label_font = {'color': 'c', 'size': 5, 'weight': 'bold'}
    ax.set_xlabel("X axis", fontdict=label_font)
    ax.set_ylabel("Y axis", fontdict=label_font)
    ax.set_zlabel("Z axis", fontdict=label_font)
    plt.show()
if __name__ == "__main__":
    # dataset = np.random.rand(500, 2)  # 500 random 2-D points in [0, 1)
    # Raw string: '\D' and '\d' are invalid escape sequences in a normal
    # string literal and trigger a warning (an error in future Pythons).
    file_path = r'D:\Desktop\datatest.xlsx'
    data_ori = pd.read_excel(file_path, sheet_name='Sheet1')
    dataset = StandardScaler().fit_transform(data_ori)  # standardize the data
    t1 = 2.5
    t2 = 2  # the mean of the distance matrix can be used as t2
    gc = Canopy(dataset)
    gc.setThreshold(t1, t2)
    canopies = gc.clustering()
    print('Get %s initial centers.' % len(canopies))
    showCanopy(canopies, dataset, t1, t2)