DBSCAN算法的java实现

Posted on 2010-05-30 15:35 我是孙海龙阅读(4112) 评论(2) 收藏举报

DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a data clustering algorithm proposed by Martin Ester, Hans-Peter Kriegel, Jörg Sander and Xiaowei Xu in 1996. It is a density based clustering algorithm because it finds a number of clusters starting from the estimated density distribution of corresponding nodes. DBSCAN is one of the most common clustering algorithms and also most cited in scientific literature.

DBSCAN是一种基于密度的聚类算法，它的基本原理就是给定两个参数，ξ和minp，其中 ξ可以理解为半径，算法将在这个半径内查找样本，minp是一个以ξ为半径查找到的样本个数n的限制条件，只要n>=minp，查找到的样本点就是核心样本点，算法的具体描述见参考文件1，下边是这个算法的java实现：

首先定义一个Point类，代表样本点：

  1: package com.sunzhenxing;
  2: 
  3: public class Point {
  4:   private int x;
  5:   private int y;
  6:   private boolean isKey;
  7:   private boolean isClassed;
  8:   
  9:   public boolean isKey() {
 10:     return isKey;
 11:   }
 12:   public void setKey(boolean isKey) {
 13:     this.isKey = isKey;
 14:     this.isClassed=true;
 15:   }
 16:   public boolean isClassed() {
 17:     return isClassed;
 18:   }
 19:   public void setClassed(boolean isClassed) {
 20:     this.isClassed = isClassed;
 21:   }
 22:   public int getX() {
 23:     return x;
 24:   }
 25:   public void setX(int x) {
 26:     this.x = x;
 27:   }
 28:   public int getY() {
 29:     return y;
 30:   }
 31:   public void setY(int y) {
 32:     this.y = y;
 33:   }
 34:   
 35:   public Point(){
 36:     x=0;
 37:     y=0;
 38:   }
 39:   public Point(int x,int y){
 40:     this.x=x;
 41:     this.y=y;
 42:   }
 43:   public Point(String str){
 44:     String[] p=str.split(",");
 45:     this.x=Integer.parseInt(p[0]);
 46:     this.y=Integer.parseInt(p[1]);
 47:   }
 48:   public String print(){
 49:     return "<"+this.x+","+this.y+">";
 50:   }
 51: }
 52:

然后定义一个工具类，为算法的实现服务：

  1: package com.sunzhenxing;
  2: 
  3: import java.io.BufferedReader;
  4: import java.io.FileReader;
  5: import java.io.IOException;
  6: import java.util.*;
  7: 
  8: public final class Utility {
  9:   //计算两点之间的距离
 10:   public static double getDistance(Point p,Point q){
 11:     int dx=p.getX()-q.getX();
 12:     int dy=p.getY()-q.getY();
 13:     double distance=Math.sqrt(dx*dx+dy*dy);
 14:     return distance;
 15:   }
 16:   //检测p点是不是核心点，tmpLst存储核心点的直达点
 17:   public static List<Point> isKeyPoint(List<Point> lst,Point p,int e,int minp){
 18:     int count=0;
 19:     List<Point> tmpLst=new ArrayList<Point>();
 20:     for(Iterator<Point> it=lst.iterator();it.hasNext();){
 21:       Point q=it.next();
 22:       if(getDistance(p,q)<=e){
 23:         ++count;
 24:         if(!tmpLst.contains(q)){
 25:           tmpLst.add(q);
 26:         }
 27:       }
 28:     }
 29:     if(count>=minp){
 30:       p.setKey(true);
 31:       return tmpLst;
 32:     }
 33:     return null;
 34:   }
 35:   //合并两个链表，前提是b中的核心点包含在a中
 36:   public static boolean mergeList(List<Point> a,List<Point> b){
 37:     boolean merge=false;
 38:     if(a==null || b==null){
 39:       return false;
 40:     }
 41:     for(int index=0;index<b.size();++index){
 42:       Point p=b.get(index);
 43:       if(p.isKey() && a.contains(p)){
 44:         merge=true;
 45:         break;
 46:       }
 47:     }
 48:     if(merge){
 49:       for(int index=0;index<b.size();++index){
 50:         if(!a.contains(b.get(index))){
 51:           a.add(b.get(index));
 52:         }
 53:       }
 54:     }
 55:     return merge;
 56:   }
 57:   //获取文本中的样本点集合
 58:   public static List<Point> getPointsList() throws IOException{
 59:     List<Point> lst=new ArrayList<Point>();
 60:     String txtPath="src\\com\\sunzhenxing\\points.txt";
 61:     BufferedReader br=new BufferedReader(new FileReader(txtPath));
 62:     String str="";
 63:     while((str=br.readLine())!=null && str!=""){
 64:       lst.add(new Point(str));
 65:     }
 66:     br.close();
 67:     return lst;
 68:   }
 69:   //显示聚类的结果
 70:   public static void display(List<List<Point>> resultList){
 71:     int index=1;
 72:     for(Iterator<List<Point>> it=resultList.iterator();it.hasNext();){
 73:       List<Point> lst=it.next();
 74:       if(lst.isEmpty()){
 75:         continue;
 76:       }
 77:       System.out.println("-----第"+index+"个聚类-----");
 78:       for(Iterator<Point> it1=lst.iterator();it1.hasNext();){
 79:         Point p=it1.next();
 80:         System.out.println(p.print());
 81:       }
 82:       index++;
 83:     }
 84:   }
 85: }
 86:

最后在主程序中实现算法，如下所示：

  1: package com.sunzhenxing;
  2: 
  3: import java.io.IOException;
  4: import java.util.*;
  5: 
  6: public class Dbscan {
  7:   private final static int e=2;//ε半径
  8:   private final static int minp=4;//密度阈值
  9:   private static List<Point> pointsList=new ArrayList<Point>();//存储原始样本点
 10:   private static List<List<Point>> resultList=new ArrayList<List<Point>>();//存储最后的聚类结果
 11:   
 12:   private static void applyDbscan() throws IOException{
 13:     pointsList=Utility.getPointsList();
 14:     for(int index=0;index<pointsList.size();++index){
 15:       List<Point> tmpLst=new ArrayList<Point>();
 16:       Point p=pointsList.get(index);
 17:       if(p.isClassed())
 18:         continue;
 19:       tmpLst=Utility.isKeyPoint(pointsList, p, e, minp);
 20:       if(tmpLst!=null){
 21:         resultList.add(tmpLst);
 22:       }
 23:     }
 24:     int length=resultList.size();
 25:     for(int i=0;i<length;++i){
 26:       for(int j=0;j<length;++j){
 27:         if(i!=j){
 28:           if(Utility.mergeList(resultList.get(i), resultList.get(j))){
 29:             resultList.get(j).clear();
 30:           }
 31:         }
 32:       }
 33:     }
 34:   }
 35:   public static void main(String[] args) {
 36:     try {
 37:       //调用DBSCAN的实现算法
 38:       applyDbscan();
 39:       Utility.display(resultList);
 40:     } catch (IOException e) {
 41:       // TODO Auto-generated catch block
 42:       e.printStackTrace();
 43:     }
 44:     
 45:   }
 46: 
 47: }
 48:

下边是一个小测试，即使用src\\com\\sunzhenxing\\points.txt文件的内容进行测试，points.txt的文件内容是：

0,0
0,1
0,2
0,3
0,4
0,5
12,1
12,2
12,3
12,4
12,5
12,6
0,6
0,7
12,7
0,8
0,9
1,1

最后算法的结果是：

-----第1个聚类-----
<0,0>
<0,1>
<0,2>
<1,1>
<0,3>
<0,4>
<0,5>
<0,6>
<0,7>
<0,8>
<0,9>
-----第2个聚类-----
<12,1>
<12,2>
<12,3>
<12,4>
<12,5>
<12,6>
<12,7>
大家画一下坐标就可以理解实验的结论了。

刷新页面返回顶部

码农的空间

公告

DBSCAN算法的java实现