Implementation of Spectral Clustering
I originally summed each row of the similarity matrix with std::accumulate when building the degree (diag) matrix. The symptoms: after computing the eigenvalues of the Laplacian matrix, the smallest eigenvalue was not 0, which cannot be right (the graph Laplacian is positive semi-definite and each of its rows sums to zero, so 0 is always an eigenvalue), and the clustering accuracy was mediocre with no obvious cause. Stepping through the code, I found the diag matrix was wrong, and eventually traced it to accumulate being given an int initial value, so the summation was carried out in integer arithmetic and truncated the similarities. After switching to a double accumulation, the accuracy improved.
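To make the pitfall concrete, here is a minimal, self-contained sketch (separate from the project code below) showing how the type of the initial value passed to std::accumulate decides the type in which the sum is carried out:

#include <iostream>
#include <numeric>
#include <vector>
int main()
{
    std::vector<double> row(3, 0.4);  // one row of a toy similarity matrix
    // initial value 0 is an int, so every partial sum is truncated to int -> 0
    double wrong = std::accumulate(row.begin(), row.end(), 0);
    // initial value 0.0 is a double, so the sum is carried out in double -> 1.2
    double right = std::accumulate(row.begin(), row.end(), 0.0);
    std::cout << wrong << " vs " << right << std::endl;
    return 0;
}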
// spectral-cluster.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <iostream>
#include<vector>
#include<set>
#include <numeric>
#include <cv.h>
#include <highgui.h>
/*********************author Marshall**************************/
/************************2015.10.20****************************/
/*********************version 1.0******************************/
using namespace std;
class spectral;
class kmeans
{
friend class spectral;
private:
double*dataset;
unsigned int k;
unsigned int dim;
unsigned int numofdata;
typedef vector<double> Centroid;
vector<Centroid> center;
vector<set<int>>cluster_ID;
vector<Centroid>new_center;
vector<set<int>>new_cluster_ID;
double threshold;
int iter;
private:
void init();
void assign();
double distance(Centroid cen, int k2);
void split(vector<set<int>>&clusters, int kk);
void update_centers();
bool isfinish();
void show_result();
public:
kmeans(double*data, unsigned int datanums, const int dim, const int numofcluster)
:dataset(data), numofdata(datanums), dim(dim), k(numofcluster)
{
threshold = 0.0000001;
}
void apply();
};
//template <typename T>
void kmeans::init()
{
center.resize(k);
set<int>bb;
for (int i = 0; i < k; i++)
{
int id = double(rand()) / double(RAND_MAX + 1.0)*numofdata;
while (bb.find(id) != bb.end())
{
id = double(rand()) / double(RAND_MAX + 1.0)*numofdata;
}
bb.insert(id);
center[i].resize(dim);
for (int j = 0; j < dim; j++)
center[i][j] = dataset[id*dim + j];
}
}
bool kmeans::isfinish()
{
double error = 0;
for (int i = 0; i < k; i++)
{
for (int j = 0; j < dim; j++)
error += pow(center[i][j] - new_center[i][j], 2);
}
return error < threshold ? true : false;
}
void kmeans::assign()
{
for (int j = 0; j < numofdata; j++)
{
double mindis = 10000000;
int belongto = -1;
for (int i = 0; i < k; i++)
{
double dis = distance(center[i], j);
if (dis < mindis)
{
mindis = dis;
belongto = i;
}
}
new_cluster_ID[belongto].insert(j);
}
for (int i = 0; i < k; i++)
{
if (new_cluster_ID[i].empty())
{
split(new_cluster_ID, i);
}
}
}
double kmeans::distance(Centroid cen, int k2)
{
double dis = 0;
for (int i = 0; i < dim; i++)
dis += pow(cen[i] - dataset[k2*dim + i], 2);
return sqrt(dis);
}
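//if a cluster comes back empty after assignment, split() repopulates it: it finds the
//largest cluster, offsets that cluster's center by +/-DELTA to get two trial centers,
//and moves the members that are closer to the minus-offset center into the empty cluster kk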
void kmeans::split(vector<set<int>>&clusters, int kk)
{
int maxsize = 0;
int th = -1;
for (int i = 0; i < k; i++)
{
if (clusters[i].size() > maxsize)
{
maxsize = clusters[i].size();
th = i;
}
}
#define DELTA 3
vector<double>tpc1, tpc2;
tpc1.resize(dim);
tpc2.resize(dim);
for (int i = 0; i < dim; i++)
{
tpc2[i] = center[th][i] - DELTA;
tpc1[i] = center[th][i] + DELTA;
}
for (set<int>::iterator it = clusters[th].begin(); it != clusters[th].end(); it++)
{
double d1 = distance(tpc1, *it);
double d2 = distance(tpc2, *it);
if (d2 < d1)
{
clusters[kk].insert(*it);
}
}
_ASSERTE(!clusters[kk].empty());
for (set<int>::iterator it = clusters[kk].begin(); it != clusters[kk].end(); it++)
clusters[th].erase(*it);
}
void kmeans::update_centers()
{
for (int i = 0; i < k; i++)
{
Centroid temp;
temp.resize(dim);
for (set<int>::iterator j = new_cluster_ID[i].begin(); j != new_cluster_ID[i].end(); j++)
{
for (int m = 0; m < dim; m++)
temp[m] += dataset[(*j)*dim + m];
}
for (int m = 0; m < dim; m++)
temp[m] /= new_cluster_ID[i].size();
new_center[i] = temp;
}
}
void kmeans::apply()
{
init();
new_center.resize(k);
new_cluster_ID.resize(k);
assign();
update_centers();
iter = 0;
while (!isfinish())
{
center = new_center;
cluster_ID = new_cluster_ID;
new_center.clear();
new_center.resize(k);
new_cluster_ID.clear();
new_cluster_ID.resize(k);
assign();
update_centers();
iter++;
}
//new_cluster_ID.clear();
}
class spectral
{
private:
unsigned int dim;
unsigned int N;
double**normalized_data;
double*laplacian_matrix;
double*diag_matrix;
double*similarity_matrix;
bool is_normalized;
double sigma2;
unsigned int k_nearest;
unsigned int K;
unsigned int numofcluster;
private:
void create_similarity_matrix();
void create_diag_matrix();
void create_laplacian_matrix();
void eigen_of_laplacian(double*&newdata);
//void kmeans();
void normalize_data(double*&data, int datanums, int dim);
double euclidean(const double*data1, const double*data2);
bool is_knearest(const double*distance_matrix, const int kk, const int mm);
public:
spectral(double**data, const int N, const int dim, const int numofcluster);
void spectral_cluster(int*pClusters);
~spectral();
};
spectral::~spectral()
{
}
spectral::spectral(double**data, const int N, const int dim, const int numofcluster)
:N(N), dim(dim), numofcluster(numofcluster)
{
K = 4;
k_nearest = 15;
sigma2 = 600;
normalized_data = data;
is_normalized = false;
//the matrices are created lazily; they must start out as NULL so the
//"if (ptr) delete[] ..." checks in the create_* functions are well defined
similarity_matrix = NULL;
diag_matrix = NULL;
laplacian_matrix = NULL;
}
void spectral::normalize_data(double*&data, int datanums, int dim)
{
double*mins = new double[dim];
double*maxs = new double[dim];
//memset cannot set a double array to a meaningful large value, and a zero
//initial maximum breaks on negative data, so seed both bounds from the first sample
for (int j = 0; j < dim; j++)
mins[j] = maxs[j] = data[j];
for (int i = 0; i < datanums; i++)
for (int j = 0; j < dim; j++)
{
if (data[i*dim + j] > maxs[j])
{
maxs[j] = data[i*dim + j];
}
if (data[i*dim + j] < mins[j])
mins[j] = data[i*dim + j];
}
for (int i = 0; i < datanums; i++)
for (int j = 0; j < dim; j++)
data[i*dim + j] = 100 * (data[i*dim + j] - mins[j]) / (maxs[j] - mins[j]);
delete[]maxs;
delete[]mins;
}
double spectral::euclidean(const double*data1, const double*data2)
{
double dis = 0;
for (int i = 0; i < dim; i++)
dis += pow(data1[i] - data2[i], 2);
return dis;
}
bool spectral::is_knearest(const double*distance_matrix,
const int kk, const int mm)
{
double dis = distance_matrix[kk*N + mm];
int count = 0;
//note that distance_matrix[kk*N + kk] == 0, hence the i != kk check below;
//because k_nearest is much smaller than N (in most cases), rejecting with false as early as possible is the cheap path
for (int i = 0; i < N; i++)
if (i != kk&&distance_matrix[kk*N + i] < dis)
{
count++;
if (count > k_nearest)
return false;
}
return true;
}
void spectral::create_similarity_matrix()
{
double*distance_matrix = new double[N*N];
memset(distance_matrix, 0, N*N*sizeof(double));
for (int i = 0; i < N - 1; i++)
for (int j = i + 1; j < N; j++)
{
distance_matrix[i*N + j] = euclidean
(normalized_data[i], normalized_data[j]);
distance_matrix[j*N + i] = distance_matrix[i*N + j];
}
if (similarity_matrix)
delete[]similarity_matrix;
similarity_matrix = new double[N*N];
memset(similarity_matrix, 0, N*N*sizeof(double));
//use the symmetric k-nearest-neighbor graph: i and j are connected if either one is among the other's k nearest neighbors
for (int i = 0; i < N - 1; i++)
{
for (int j = i + 1; j < N; j++)
{
if (is_knearest(distance_matrix, i, j)
|| is_knearest(distance_matrix, j, i))
{
similarity_matrix[i*N + j] = similarity_matrix[j*N + i]
= 100 * exp(-distance_matrix[i*N + j] / sigma2);
}
//cout << similarity_matrix[i*N + j] << " ";
}
//cout << endl << endl << endl << endl;
}
/*for (int i = 0; i < N; i++)
{
for (int j = 0; j < N; j++)
{
cout << similarity_matrix[i*N + j] << " ";
}
cout << endl << endl << endl << endl;
}*/
delete[]distance_matrix;
}
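//degree matrix: diag_matrix[i*N + i] is the sum of row i of the similarity matrix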
void spectral::create_diag_matrix()
{
if (diag_matrix)
delete[]diag_matrix;
diag_matrix = new double[N*N];
memset(diag_matrix, 0, N*N*sizeof(double));
for (int i = 0; i < N; i++)
{
//bug record: accumulate with an int initial value (0) sums in integer arithmetic, which corrupted the degrees and hurt the accuracy
//diag_matrix[i*N + i] = accumulate(similarity_matrix + i*N, similarity_matrix + i*N + N, 0);
double sum = 0;
for (int j = 0; j < N; j++)
sum += similarity_matrix[i*N + j];
diag_matrix[i*N + i] = sum;
//cout << "diag" << i << " " << diag_matrix[i*N + i] << endl;
}
}
void spectral::create_laplacian_matrix()
{
if (laplacian_matrix)
delete[]laplacian_matrix;
laplacian_matrix = new double[N*N];
for (int i = 0; i < N; i++)
for (int j = i; j < N; j++)
{
laplacian_matrix[i*N + j] = laplacian_matrix[j*N + i]
= diag_matrix[i*N + j] - similarity_matrix[i*N + j];
}
if (is_normalized)
{
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
{
int temp = i == j ? 1 : 0;
laplacian_matrix[i*N + j] = temp - 1.0 / sqrt(diag_matrix[j*N + j]
+ 1.0e-13)*laplacian_matrix[i*N + j] * 1.0 / sqrt(diag_matrix[i*N + i] + 1.0e-13);
}
}
delete[]diag_matrix;
delete[]similarity_matrix;
diag_matrix = NULL;
similarity_matrix = NULL;
}
void spectral::eigen_of_laplacian(double*&newdata)
{
//CvMat Ma = cvMat(N, N, CV_64FC1, laplacian_matrix);
CvMat *pSrcMatrix = cvCreateMat(N, N, CV_64FC1);
// E = the corresponding eigenvectors (one per row)
CvMat* E = cvCreateMat(N, N, CV_64FC1);
CvMat* l = cvCreateMat(N, 1, CV_64FC1);
for (int i = 0; i < N; i++)
{
cvmSet(l, i, 0, 0);
for (int j = 0; j < N; j++)
{
cvmSet(pSrcMatrix, i, j, laplacian_matrix[i*N + j]);
cvmSet(E, i, j, 0);
}
}
cvEigenVV(pSrcMatrix, E, l); // l = eigenvalues of the Laplacian (in descending order)
for (int i = 0; i < N; i++)
cout << cvmGet(l, i, 0) << " ";//the Laplacian is positive semi-definite, so the smallest eigenvalue should be (numerically) zero; this printout is the place to verify that
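//the rows of E are ordered by descending eigenvalue, so row N-1 holds the (near-)constant eigenvector for eigenvalue ~0;
//take the next K rows (N-2 down to N-1-K), i.e. the eigenvectors of the K smallest non-trivial eigenvalues, as the K-dimensional embedding of each sample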
for (int i = 0; i < N; i++)
{
//cout << cvmGet(E, N - 1 , i);
for (int j = 0; j < K; j++)
{
newdata[i*K + j] = cvmGet(E, N - 2 - j, i);
}
}
delete[]laplacian_matrix;
laplacian_matrix = NULL;
// release the CvMat objects
cvReleaseMat(&pSrcMatrix);
cvReleaseMat(&E);
cvReleaseMat(&l);
}
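//full pipeline: similarity matrix -> degree matrix -> Laplacian -> K-dimensional spectral embedding -> k-means on the embedded points;
//pClusters[i] receives the cluster index assigned to sample i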
void spectral::spectral_cluster(int*pClusters)
{
create_similarity_matrix();
create_diag_matrix();
create_laplacian_matrix();
double*newdata = new double[N*K];
eigen_of_laplacian(newdata);
//normalize_data(newdata, this->N, dim);
kmeans km(newdata, N, K, this->numofcluster);
km.apply();
delete[]newdata;
newdata = NULL;
int count = 0;
for (int i = 0; i < km.new_cluster_ID.size(); i++)
for (set<int>::iterator it = km.new_cluster_ID[i].begin(); it != km.new_cluster_ID[i].end(); it++)
{
pClusters[*it] = i;
count++;
}
_ASSERTE(count == N);
}
int _tmain(int argc, _TCHAR* argv[])
{
#define MAX_CLUSTERS 5
CvScalar color_tab[MAX_CLUSTERS];
IplImage* img = cvCreateImage(cvSize(500, 500), IPL_DEPTH_8U, 3);
CvRNG rng = cvRNG(cvGetTickCount());
CvPoint ipt;
color_tab[0] = CV_RGB(255, 0, 0);
color_tab[1] = CV_RGB(0, 255, 0);
color_tab[2] = CV_RGB(100, 100, 255);
color_tab[3] = CV_RGB(255, 0, 255);
color_tab[4] = CV_RGB(255, 255, 0);
int clusterNum = 4;
int sampleNum = 500;
int feaNum = 2;
CvMat* sampleMat = cvCreateMat(sampleNum, 1, CV_32FC2);
/* generate random samples from a mixture of Gaussians, one component per cluster */
for (int k = 0; k < clusterNum; k++)
{
CvPoint center;
int topIndex = k*sampleNum / clusterNum;
int bottomIndex = (k == clusterNum - 1 ? sampleNum : (k + 1)*sampleNum / clusterNum);
CvMat* localMat = cvCreateMatHeader(bottomIndex - topIndex, 1, CV_32FC2);
center.x = cvRandInt(&rng) % img->width;
center.y = cvRandInt(&rng) % img->height;
cvGetRows(sampleMat, localMat, topIndex, bottomIndex, 1); //cvGetRows only builds a header/view, it does not allocate memory for localMat; the row range excludes bottomIndex
cvRandArr(&rng, localMat, CV_RAND_NORMAL,
cvScalar(center.x, center.y, 0, 0),
cvScalar(img->width*0.1, img->height*0.1, 0, 0));
}
// shuffle the samples
for (int i = 0; i < sampleNum / 2; i++)
{
CvPoint2D32f* pt1 = (CvPoint2D32f*)sampleMat->data.fl + cvRandInt(&rng) % sampleNum;
CvPoint2D32f* pt2 = (CvPoint2D32f*)sampleMat->data.fl + cvRandInt(&rng) % sampleNum;
CvPoint2D32f temp;
CV_SWAP(*pt1, *pt2, temp);
}
double** pSamples = new double*[sampleNum];
for (int i = 0; i < sampleNum; i++)
pSamples[i] = new double[feaNum];
int* pClusters = new int[sampleNum];
for (int i = 0; i < sampleNum; i++)
{
//feaNum=2
pSamples[i][0] = (cvGet2D(sampleMat, i, 0)).val[0];
//cout << pSamples[i][0] << " ";
pSamples[i][1] = (cvGet2D(sampleMat, i, 0)).val[1];
//cout << pSamples[i][1] << endl;
}
cvReleaseMat(&sampleMat);
cvZero(img);
for (int i = 0; i < sampleNum; i++)
{
//feaNum=2
ipt.x = pSamples[i][0];
ipt.y = pSamples[i][1];
cvCircle(img, ipt, 1, cvScalar(255, 255, 255), CV_FILLED, CV_AA, 0);
}
cvSaveImage("origin.jpg", img);
//run spectral clustering
spectral sp(pSamples, sampleNum, feaNum, clusterNum);
sp.spectral_cluster(pClusters);
cvZero(img);
for (int i = 0; i < sampleNum; i++)
{
//feaNum=2
int cluster_idx = pClusters[i];
ipt.x = pSamples[i][0];
ipt.y = pSamples[i][1];
cvCircle(img, ipt, 1, color_tab[cluster_idx], CV_FILLED, CV_AA, 0);
}
delete[] pClusters;
pClusters = NULL;
delete[] pSamples;
pSamples = NULL;
cvNamedWindow("clusters");
cvShowImage("clusters", img);
cvWaitKey(0);
cvSaveImage("clusters.jpg", img);
cvReleaseImage(&img);
system("pause");
cvDestroyWindow("clusters");
return 0;
}
The clustering result images follow (the program writes them out as origin.jpg and clusters.jpg).
If the C++ version feels too verbose, here is a Python alternative:
#!/usr/bin/python
# copyright (c) 2008 Feng Zhu, Yong Sun
import heapq
from functools import partial
from numpy import *
from scipy.linalg import *
from scipy.cluster.vq import *
import pylab
def line_samples ():
    vecs = random.rand (120, 2)
    vecs [:,0] *= 3
    vecs [0:40,1] = 1
    vecs [40:80,1] = 2
    vecs [80:120,1] = 3
    return vecs

def gaussian_simfunc (v1, v2, sigma=1):
    tee = (-norm(v1-v2)**2)/(2*(sigma**2))
    return exp (tee)

def construct_W (vecs, simfunc=gaussian_simfunc):
    n = len (vecs)
    W = zeros ((n, n))
    for i in xrange(n):
        for j in xrange(i,n):
            W[i,j] = W[j,i] = simfunc (vecs[i], vecs[j])
    return W

def knn (W, k, mutual=False):
    n = W.shape[0]
    assert (k>0 and k<(n-1))
    for i in xrange(n):
        thr = heapq.nlargest(k+1, W[i])[-1]
        for j in xrange(n):
            if W[i,j] < thr:
                W[i,j] = -W[i,j]
    for i in xrange(n):
        for j in xrange(i, n):
            if W[i,j] + W[j,i] < 0:
                W[i,j] = W[j,i] = 0
            elif W[i,j] + W[j,i] == 0:
                W[i,j] = W[j,i] = 0 if mutual else abs(W[i,j])
vecs = line_samples()
W = construct_W (vecs, simfunc=partial(gaussian_simfunc, sigma=2))
knn (W, 10)
D = diag([reduce(lambda x,y:x+y, Wi) for Wi in W])
L = D - W
evals, evcts = eig(L,D)
vals = dict (zip(evals, evcts.transpose()))
keys = vals.keys()
keys.sort()
Y = array ([vals[k] for k in keys[:3]]).transpose()
res,idx = kmeans2(Y, 3, minit='points')
colors = [(1,2,3)[i] for i in idx]
pylab.scatter(vecs[:,0],vecs[:,1],c=colors)
pylab.show()