DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a data clustering algorithm proposed by Martin Ester, Hans-Peter Kriegel, Jörg Sander and Xiaowei Xu in 1996. It is a density based clustering algorithm because it finds a number of clusters starting from the estimated density distribution of corresponding nodes. DBSCAN is one of the most common clustering algorithms and also most cited in scientific literature.
DBSCAN是一种基于密度的聚类算法,它的基本原理就是给定两个参数,ξ和minp,其中 ξ可以理解为半径,算法将在这个半径内查找样本,minp是一个以ξ为半径查找到的样本个数n的限制条件,只要n>=minp,查找到的样本点就是核心样本点,算法的具体描述见参考文件1,下边是这个算法的java实现:
首先定义一个Point类,代表样本点:
1: package com.sunzhenxing;2:3: public class Point {4: private int x;5: private int y;6: private boolean isKey;7: private boolean isClassed;8:9: public boolean isKey() {10: return isKey;11: }12: public void setKey(boolean isKey) {13: this.isKey = isKey;14: this.isClassed=true;15: }16: public boolean isClassed() {17: return isClassed;18: }19: public void setClassed(boolean isClassed) {20: this.isClassed = isClassed;21: }22: public int getX() {23: return x;24: }25: public void setX(int x) {26: this.x = x;27: }28: public int getY() {29: return y;30: }31: public void setY(int y) {32: this.y = y;33: }34:35: public Point(){36: x=0;37: y=0;38: }39: public Point(int x,int y){40: this.x=x;41: this.y=y;42: }43: public Point(String str){44: String[] p=str.split(",");45: this.x=Integer.parseInt(p[0]);46: this.y=Integer.parseInt(p[1]);47: }48: public String print(){49: return "<"+this.x+","+this.y+">";50: }51: }52:
然后定义一个工具类,为算法的实现服务:
1: package com.sunzhenxing;2:3: import java.io.BufferedReader;4: import java.io.FileReader;5: import java.io.IOException;6: import java.util.*;7:8: public final class Utility {9: //计算两点之间的距离10: public static double getDistance(Point p,Point q){11: int dx=p.getX()-q.getX();12: int dy=p.getY()-q.getY();13: double distance=Math.sqrt(dx*dx+dy*dy);14: return distance;15: }16: //检测p点是不是核心点,tmpLst存储核心点的直达点17: public static List<Point> isKeyPoint(List<Point> lst,Point p,int e,int minp){18: int count=0;19: List<Point> tmpLst=new ArrayList<Point>();20: for(Iterator<Point> it=lst.iterator();it.hasNext();){21: Point q=it.next();22: if(getDistance(p,q)<=e){23: ++count;24: if(!tmpLst.contains(q)){25: tmpLst.add(q);26: }27: }28: }29: if(count>=minp){30: p.setKey(true);31: return tmpLst;32: }33: return null;34: }35: //合并两个链表,前提是b中的核心点包含在a中36: public static boolean mergeList(List<Point> a,List<Point> b){37: boolean merge=false;38: if(a==null || b==null){39: return false;40: }41: for(int index=0;index<b.size();++index){42: Point p=b.get(index);43: if(p.isKey() && a.contains(p)){44: merge=true;45: break;46: }47: }48: if(merge){49: for(int index=0;index<b.size();++index){50: if(!a.contains(b.get(index))){51: a.add(b.get(index));52: }53: }54: }55: return merge;56: }57: //获取文本中的样本点集合58: public static List<Point> getPointsList() throws IOException{59: List<Point> lst=new ArrayList<Point>();60: String txtPath="src\\com\\sunzhenxing\\points.txt";61: BufferedReader br=new BufferedReader(new FileReader(txtPath));62: String str="";63: while((str=br.readLine())!=null && str!=""){64: lst.add(new Point(str));65: }66: br.close();67: return lst;68: }69: //显示聚类的结果70: public static void display(List<List<Point>> resultList){71: int index=1;72: for(Iterator<List<Point>> it=resultList.iterator();it.hasNext();){73: List<Point> lst=it.next();74: if(lst.isEmpty()){75: continue;76: }77: System.out.println("-----第"+index+"个聚类-----");78: for(Iterator<Point> it1=lst.iterator();it1.hasNext();){79: Point p=it1.next();80: System.out.println(p.print());81: }82: index++;83: }84: }85: }86:
最后在主程序中实现算法,如下所示:
1: package com.sunzhenxing;2:3: import java.io.IOException;4: import java.util.*;5:6: public class Dbscan {7: private final static int e=2;//ε半径8: private final static int minp=4;//密度阈值9: private static List<Point> pointsList=new ArrayList<Point>();//存储原始样本点10: private static List<List<Point>> resultList=new ArrayList<List<Point>>();//存储最后的聚类结果11:12: private static void applyDbscan() throws IOException{13: pointsList=Utility.getPointsList();14: for(int index=0;index<pointsList.size();++index){15: List<Point> tmpLst=new ArrayList<Point>();16: Point p=pointsList.get(index);17: if(p.isClassed())18: continue;19: tmpLst=Utility.isKeyPoint(pointsList, p, e, minp);20: if(tmpLst!=null){21: resultList.add(tmpLst);22: }23: }24: int length=resultList.size();25: for(int i=0;i<length;++i){26: for(int j=0;j<length;++j){27: if(i!=j){28: if(Utility.mergeList(resultList.get(i), resultList.get(j))){29: resultList.get(j).clear();30: }31: }32: }33: }34: }35: public static void main(String[] args) {36: try {37: //调用DBSCAN的实现算法38: applyDbscan();39: Utility.display(resultList);40: } catch (IOException e) {41: // TODO Auto-generated catch block42: e.printStackTrace();43: }44:45: }46:47: }48:
下边是一个小测试, 即使用src\\com\\sunzhenxing\\points.txt文件的内容进行测试,points.txt的文件内容是:
0,0
0,1
0,2
0,3
0,4
0,5
12,1
12,2
12,3
12,4
12,5
12,6
0,6
0,7
12,7
0,8
0,9
1,1
最后算法的结果是:
-----第1个聚类-----
<0,0>
<0,1>
<0,2>
<1,1>
<0,3>
<0,4>
<0,5>
<0,6>
<0,7>
<0,8>
<0,9>
-----第2个聚类-----
<12,1>
<12,2>
<12,3>
<12,4>
<12,5>
<12,6>
<12,7>
大家画一下坐标就可以理解实验的结论了。
浙公网安备 33010602011771号