使用 Weka 进行文本聚类的例子
先看上一篇会更容易看懂，这篇的注释不多！
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import weka.clusterers.Clusterer;
import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;
/**
 * Clusters plain-text documents with Weka.
 *
 * <p>Each regular file directly under {@link #path} is loaded as one instance with a
 * single string attribute named {@code "message"}. The data set is then passed through a
 * {@link StringToWordVector} filter and handed to the {@link Clusterer} supplied at
 * construction time.
 */
public class MessageClustering {
    /** Directory whose files are loaded as documents, one document per file. */
    static String path="E:\\datasets\\alt.atheism\\";

    /** Filtered (word-vector) data set; {@code null} until {@link #loadInstances()} succeeds. */
    private Instances instances = null;
    private final StringToWordVector filter = new StringToWordVector();
    private final Clusterer clusterer;

    /**
     * @param clusterer the clustering algorithm that {@link #testCluster()} builds and queries
     */
    public MessageClustering(Clusterer clusterer) {
        this.clusterer = clusterer;
    }

    /**
     * Reads every regular file under {@link #path} into a string instance and converts the
     * resulting data set to word vectors.
     *
     * @throws IllegalStateException if {@link #path} cannot be listed as a directory
     * @throws Exception if the Weka filter rejects the data
     */
    public void loadInstances() throws Exception {
        File[] files = new File(path).listFiles();
        if (files == null) {
            // listFiles() yields null (not an empty array) when the path is not a
            // directory or cannot be read; fail with a message instead of a bare NPE.
            throw new IllegalStateException("Cannot list document directory: " + path);
        }

        FastVector attributes = new FastVector(1);
        // Passing a null value list declares "message" as a string attribute.
        attributes.addElement(new Attribute("message", (FastVector) null));
        Instances raw = new Instances("text", attributes, 100);
        Attribute messageAttribute = raw.attribute("message");

        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories cannot be read as documents; skipping them avoids
                // adding empty instances to the data set.
                continue;
            }
            String message = getAllMessage(file);
            Instance instance = new Instance(1);
            instance.setValue(messageAttribute, messageAttribute.addStringValue(message));
            instance.setDataset(raw);
            raw.add(instance);
        }

        filter.setInputFormat(raw);
        instances = Filter.useFilter(raw, filter);
    }

    /**
     * Builds the clusterer on the loaded data, then prints one line per instance in the form
     * {@code <tab><1-based index>:<cluster>} followed by the number of clusters.
     *
     * @throws IllegalStateException if {@link #loadInstances()} has not completed successfully
     * @throws Exception if the clusterer fails
     */
    public void testCluster() throws Exception {
        if (instances == null) {
            throw new IllegalStateException("loadInstances() must be called before testCluster()");
        }
        clusterer.buildClusterer(instances);
        for (int i = 0; i < instances.numInstances(); i++) {
            int cluster = clusterer.clusterInstance(instances.instance(i));
            System.out.println("\t"+(i+1)+":"+cluster);
        }
        System.out.println(clusterer.numberOfClusters());
    }

    /**
     * Returns the content of {@code file} as a single string: each line is trimmed and the
     * lines are joined with a single space so that words on adjacent lines stay separate.
     *
     * <p>Reading is best-effort: on an I/O error the failure is reported on standard error
     * and whatever text was read so far is returned.
     *
     * @param file the document to read (decoded with the platform default charset)
     * @return the joined text, possibly empty
     */
    private String getAllMessage(File file) {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            String line;
            while ((line = br.readLine()) != null) {
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(line.trim());
            }
        } catch (IOException e) {
            System.err.println("Failed to read " + file + ": " + e);
        } finally {
            // Close on every path so a failed read does not leak the file handle.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if close itself fails.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Clusters the documents under {@link #path} into five groups with k-means and prints
     * the assignment of each document.
     */
    public static void main(String[] args) throws Exception {
        // Construct the clustering algorithm.
        SimpleKMeans cluster = new SimpleKMeans();
        cluster.setNumClusters(5);
        MessageClustering sk = new MessageClustering(cluster);
        sk.loadInstances();
        // Run the clustering and print the result.
        sk.testCluster();
    }
}
浙公网安备 33010602011771号