1 import java.io.BufferedReader;
2 import java.io.FileReader;
3 import java.io.FileWriter;
4 import java.io.ObjectInputStream.GetField;
5 import java.util.ArrayList;
6 import java.util.Random;
7
8
9 public class Kmeans {
10
11 private int k;
12 private Vector[] cluster_centers;
13 private int[] point_ids;
14
15 private int num_clusters;//used to adapt to different initialization
16
17 Kmeans(int k){
18 this.k=k;
19 cluster_centers=new Vector[k];
20 }
21
22 private Vector get_cluster_center(int i){
23 return cluster_centers[i];
24 }
25
26 private int get_point_id(int i){
27 return point_ids[i];
28 }
29
30 /**
31 * returns the index of the cluster center the closest to the point X
32 */
33 private int get_closest_center_id(Vector X){
34 int id=0;
35 double tmp,distance=distance(X,get_cluster_center(0));
36 for(int i=1;i<k&&i<num_clusters;i++){
37 tmp=distance(X,get_cluster_center(i));
38 if(distance>tmp){
39 distance=tmp;
40 id=i;
41 }
42 }
43 return id;
44 }
45
46
47 /**
48 * add the copy construction in class Vector and KahanSum
49 * */
50 private double distance(final Vector A, final Vector B){
51 Vector diff=new Vector(A);
52 diff.sub(B);
53 return diff.norm();
54 }
55
56 /**
57 * a simple random initialization, used in Kmeans
58 */
59 private void init_point_indexes(Random rnd){
60 for(int i=0;i<point_ids.length;i++)
61 point_ids[i] = rnd.nextInt(k);
62 num_clusters=k;
63 }
64 /**
65 * Kmeans++ initialization
66 */
67 private void init_point_KMplusplus(Random rd,ArrayList<Vector> points){
68 int cluster0=rd.nextInt(points.size());//choose the first center
69 cluster_centers[0]=points.get(cluster0);
70 //points.remove(cluster0);
71 num_clusters=1;
72
73 double[] D=new double[points.size()];
74
75 while(num_clusters < k){
76 int sum=0;
77 for(int i=0;i<points.size();i++){
78 double d=distance(points.get(i),cluster_centers[ get_closest_center_id(points.get(i)) ]);
79 sum+=d*d;
80 D[i]=sum;
81 }
82
83 double r = rd.nextDouble()*sum;
84 for (int i = 0 ; i < D.length; i++) {
85 if (D[i] >= r){
86 cluster_centers[num_clusters]=points.get(i);
87 //points.remove(i);
88 num_clusters++;
89 break;
90 }
91 }
92
93 }
94
95 assignment(points);
96
97 }
98 /**********************************************************************
99 * choose init_point_indexs(new Random) to do the Kmeans initialization
100 * or
101 * choose init_point_KMplusplus(new Random(),points)
102 * to do the Kmeans++ initialization
103 *
104 * ******************************************************************/
105 int clusterize(ArrayList<Vector> points){
106 int iterations=1;
107 point_ids = new int[points.size()];
108
109 //init_point_indexes(new Random());
110 init_point_KMplusplus(new Random(),points);
111
112 update(points);
113
114 while(!assignment(points)){
115 update(points);
116 iterations++;
117 //if(iterations>100)return iterations;
118 }
119 return iterations;
120 }
121
122 //update the center of different clusters
123 private void update(ArrayList<Vector> points){
124 ArrayList<Vector> cluster_members=new ArrayList<Vector>();
125
126 //calculate the centers
127 for(int i=0;i<k;i++){
128 for(int j=0;j<point_ids.length;j++){
129 if(get_point_id(j)==i)
130 cluster_members.add(points.get(j));
131 }
132
133 //System.out.println("!!!!!!"+cluster_members.size());
134 if(cluster_members.size()!=0)
135 cluster_centers[i]=Vector.vector_median(cluster_members);
136 cluster_members.clear();
137 }
138 }
139
140 private boolean assignment(ArrayList<Vector> points){
141 boolean convergence=true;
142
143 for(int i=0;i<points.size();i++){
144 int closest_id=get_closest_center_id(points.get(i));
145 if(point_ids[i]!=closest_id){
146 point_ids[i]=closest_id;
147 convergence=false;
148 }
149 }
150 return convergence;
151 }
152
153 public void write_data_withID(String filename,ArrayList<Vector> list){
154 FileWriter fw;
155 int i=0,j=0;
156 Vector v;
157 try
158 {
159 fw = new FileWriter(filename);
160 while(j<list.size()){
161 i=0;
162 v=list.get(j++);
163 while(i<v.get_length()){
164 fw.write(Double.toString(v.get(i++))+" ");
165 }
166 fw.write(Integer.toString(get_point_id(j-1)));
167 fw.write('\n');
168
169 }
170 fw.flush();
171 fw.close();
172 System.out.println("Vector write with cluster_id finished");
173
174 }catch(Exception e){
175 e.printStackTrace();
176 }
177 }
178
179 /************************
180 * cluster analysis:
181 * http://en.wikipedia.org/wiki/Cluster_analysis
182 * higher is better
183 * Dunn need the points to be clustered first
184 * so load the List of points and do clusterize() function
185 * **********************/
186 public double Dunn(ArrayList<Vector> points){
187
188 clusterize(points);
189 double max_intra_distance=0;
190 double min_cluster_distance=Double.MAX_VALUE;
191 double temp=0;
192 double temp_intra_distance=0;
193 for(int i=0;i<k;i++){
194
195 temp_intra_distance = max_intra_distance(i,points);
196 if(temp_intra_distance > max_intra_distance)
197 max_intra_distance = temp_intra_distance;
198
199 for(int j=i+1;j<k;j++){
200 temp=distance(cluster_centers[i], cluster_centers[j]);
201 if(temp<min_cluster_distance)
202 min_cluster_distance=temp;
203 }
204 }
205 if(min_cluster_distance==Double.MAX_VALUE||max_intra_distance==0)
206 {
207 System.out.println("Only have one cluster or Max intra cluster distance is 0" +
208 "\nthe return value will be '0'.");
209 return 0;
210 }
211
212 return min_cluster_distance/max_intra_distance;
213
214 }
215 /**
216 *calculate the average distance of points of cluster i,
217 *do clusterize to cluster the points first,
218 *the lower the better
219 */
220 public double Davies_Bouldin(ArrayList<Vector> points){
221 clusterize(points);
222 double[] Average=AverageDistance(points);//average distance of points of cluster i
223
224 double maxValue=0;
225 KahanSum sum=new KahanSum();
226
227 for(int i=0;i<k;i++){
228 for(int j=i+1;j<k;j++){
229 double temp=Average[i]+Average[j];
230 temp/=distance(cluster_centers[i],cluster_centers[j]);
231 if(temp>maxValue)
232 maxValue=temp;
233 }
234 sum.add(maxValue);
235 }
236 if(k==1)System.out.println("cluster number is 1, the value will be 0");
237
238 return sum.getsum()/k;
239
240 }
241 /**
242 * the function max_intra_distance get the max intra distance in cluster i
243 * */
244 private double max_intra_distance(int i,ArrayList<Vector> points){
245 double dis=0;
246 double max_inrta=0;
247 for(int j=0;j<points.size();j++){
248 if(point_ids[j]==i){
249 dis=distance(cluster_centers[i],points.get(j));
250 if(dis>max_inrta)
251 max_inrta=dis;
252 }
253 }
254
255 return max_inrta;
256 }
257 /**
258 * return a double[], element i has the value of average distance
259 * of the points of cluster i
260 */
261 private double[] AverageDistance( ArrayList<Vector> points){
262
263 double[] average =new double[k];
264 KahanSum distance=new KahanSum();
265 int count=0;
266
267 for(int i=0;i<k;i++){
268 for(int j=0;j<point_ids.length;j++){
269 if(get_point_id(j)==i)
270 {
271 count++;
272 distance.add(distance(cluster_centers[i],points.get(j)));
273 }
274 }
275
276 average[i]=0;
277 if(count!=0)
278 average[i]=distance.getsum()/count;
279 distance.reset();
280 count=0;
281 }
282 return average;
283
284 }
285
286 public static void main(String[] args) {
287 // TODO Auto-generated method stub
288 ArrayList<Vector> points;
289 if(args.length==0)
290 points=Vector.read_data("dataset-4");//the dataset
291 else
292 points=Vector.read_data(args[0]);
293 //points.get(0).printvec();
294
295 int dim=points.get(0).get_length();
296 //System.out.println(points.size()+" "+dim);
297
298 Kmeans km=new Kmeans(2);
299 System.out.println("the iterations is: "+km.clusterize(points)+"\n" +
300 " by using the initialization of kmeans++.");
301 if(args.length==2)
302 km.write_data_withID(args[1], points);
303 else
304 km.write_data_withID("out-datasets", points);
305
306
307 for(int i=1;i<6;i++){
308 km=new Kmeans(i);
309 System.out.println("Dunn cluster_num ="+i+" "+km.Dunn(points));
310 }
311
312 for(int i=1;i<6;i++){
313 km=new Kmeans(i);
314 System.out.println("Davies_Bouldin cluster_num ="+i+" "+km.Davies_Bouldin(points));
315 }
316
317
318 }
319
320 }