# Canopy 算法做聚类 (clustering with the Canopy algorithm)

# -*- coding: utf-8 -*-
import math
import random
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

class Canopy:
    """Canopy clustering of a set of points using two distance thresholds.

    A point closer than ``t1`` to a canopy center becomes a member of that
    canopy; a point closer than ``t2`` is additionally removed from the pool
    of candidates, so it can neither become a center nor join a later canopy.
    """

    def __init__(self, dataset):
        """Store the points to cluster; thresholds start unset (both 0)."""
        self.dataset = dataset
        self.t1 = 0
        self.t2 = 0

    def setThreshold(self, t1, t2):
        """Set the thresholds; they are only accepted when ``t1 > t2``.

        When ``t1 <= t2`` a message is printed and the stored thresholds are
        left unchanged.
        """
        if t1 > t2:
            self.t1 = t1
            self.t2 = t2
        else:
            print('t1 needs to be larger than t2!')

    def euclideanDistance(self, vec1, vec2):
        """Return the Euclidean distance between two vectors."""
        return math.sqrt(((vec1 - vec2)**2).sum())

    def getRandIndex(self):
        """Return a random valid index into ``self.dataset``.

        ``random.randint`` raises ``ValueError`` when the dataset is empty.
        """
        return random.randint(0, len(self.dataset) - 1)

    def getDistanceMatrix(self):
        """Return the square matrix of pairwise Euclidean distances."""
        # pdist yields the condensed distances; squareform expands them
        # into a full square matrix.
        return squareform(pdist(self.dataset, metric='euclidean'))

    def clustering(self):
        """Group the dataset into canopies.

        Returns:
            A list of ``(center, members)`` tuples, where ``members`` is the
            list of points closer than ``t1`` to ``center`` (the center
            itself is not included). An empty list is returned, after
            printing a hint, when no threshold has been set.

        ``self.dataset`` is left untouched, so the method can be called
        repeatedly on the same instance.
        """
        if self.t1 == 0:
            print('Please set the threshold.')
            return []

        # Work on a local array so the instance keeps its data.
        remaining = np.asarray(self.dataset)
        canopies = []
        while len(remaining) != 0:
            # Pick a random center and take it out of the candidate pool.
            center_index = random.randint(0, len(remaining) - 1)
            center = remaining[center_index]
            remaining = np.delete(remaining, center_index, 0)

            distances = np.fromiter(
                (self.euclideanDistance(center, point) for point in remaining),
                dtype=float, count=len(remaining))

            # Points within t1 belong to this canopy.
            members = list(remaining[distances < self.t1])
            # Points within t2 are dropped from further consideration.
            remaining = remaining[~(distances < self.t2)]
            canopies.append((center, members))
        return canopies


# 三维画图
def showCanopy(canopies, dataset, t1, t2):
    """Draw every canopy in a 3D scatter plot and show the figure.

    Args:
        canopies: iterable of ``(center, components)`` pairs; the first
            three coordinates of each point are used as x, y and z.
        dataset: accepted for interface compatibility; not used.
        t1: accepted for interface compatibility; not used.
        t2: accepted for interface compatibility; not used.

    Each canopy gets a random color and a random marker; the center is drawn
    larger than its member points.
    """
    fig = plt.figure()
    # Create the 3D axes through the figure so they are actually attached to
    # it; a bare Axes3D(fig) is not added to the figure on current Matplotlib.
    ax = fig.add_subplot(projection='3d')
    markers = ['.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', 's', 'p', '*', 'h', 'H', '+', 'x', 'D', 'd', '|', '_']

    for center, components in canopies:
        color_i = np.random.rand(3,)  # random color
        marker_i = markers[np.random.randint(0, len(markers))]  # random marker
        # Canopy center
        ax.scatter(center[0], center[1], center[2], marker=marker_i, color=color_i, s=5)

        # Canopy members, drawn with a single call instead of one per point
        if len(components) > 0:
            points = np.asarray(components)
            ax.scatter(points[:, 0], points[:, 1], points[:, 2], marker=marker_i, color=color_i, s=1.5)

    label_font = {'color': 'c', 'size': 5, 'weight': 'bold'}
    ax.set_xlabel("X axis", fontdict=label_font)
    ax.set_ylabel("Y axis", fontdict=label_font)
    ax.set_zlabel("Z axis", fontdict=label_font)

    plt.show()


if __name__ == "__main__":
    # dataset = np.random.rand(500, 2)  # 500 random 2-D points in [0, 1)
    # Raw string so the backslashes in the Windows path are taken literally
    # instead of being parsed as (invalid) escape sequences.
    file_path = r'D:\Desktop\datatest.xlsx'
    data_ori = pd.read_excel(file_path, sheet_name='Sheet1')
    dataset = StandardScaler().fit_transform(data_ori)  # standardize the data
    t1 = 2.5
    t2 = 2  # the mean of the distance matrix can be used as t2
    gc = Canopy(dataset)
    gc.setThreshold(t1, t2)
    canopies = gc.clustering()
    print('Get %s initial centers.' % len(canopies))
    showCanopy(canopies, dataset, t1, t2)
# posted @ 2021-01-06 11:46 二二二狗子 阅读(359) 评论(0) 收藏 举报
# 刷新页面 返回顶部
# canopy算法做聚类

# 公告