尝试用 Java 实现 k-means，欢迎有经验的朋友多多指点，也欢迎初学者一起学习。

1.首先将停用词读入ArrayList中：

代码

/**
 * Loads the stop-word list from the file "stopWord.txt", one entry per line.
 *
 * Lines are stored exactly as read (no trimming, no de-duplication). If the
 * file cannot be opened or a read fails, the stack trace is printed and the
 * entries collected so far (possibly none) are returned.
 *
 * @return the lines of the stop-word file in file order; never null
 */
public ArrayList<String> stopword() {
        ArrayList<String> stopWords = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            // NOTE(review): FileReader decodes with the JVM default charset;
            // confirm that matches the encoding of stopWord.txt.
            reader = new BufferedReader(new FileReader(new File("stopWord.txt")));
            String line;
            while ((line = reader.readLine()) != null) {
                stopWords.add(line);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Close in finally so the file handle is released even when a
            // read fails part-way through.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception closeEx) {
                    closeEx.printStackTrace();
                }
            }
        }
        return stopWords;
    }

2. 去掉文章中的停用词（输入文件每行一个词），剩下的就是待统计 TF-IDF 的词项（term）：

代码

/**
 * Reads a document and returns its lines with stop words removed.
 *
 * Each line of the file is treated as one token. A token is kept when the
 * list returned by {@link #stopword()} does not contain an equal string;
 * order and duplicates of the remaining tokens are preserved. If the file
 * cannot be opened or a read fails, the stack trace is printed and the
 * tokens collected so far (possibly none) are returned.
 *
 * @param filename path of the document to read
 * @return the non-stop-word tokens in file order; never null
 */
public ArrayList<String> delete(String filename) {
        ArrayList<String> keptTokens = new ArrayList<String>();
        ArrayList<String> stopWords = stopword();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(filename)));
            String token;
            while ((token = reader.readLine()) != null) {
                if (!stopWords.contains(token)) {
                    keptTokens.add(token);
                }
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Close in finally so the file handle is released even when a
            // read fails part-way through.
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception closeEx) {
                    closeEx.printStackTrace();
                }
            }
        }
        return keptTokens;
    }

3. 计算每个词的 TF-IDF 值，并返回得分最高的两个值所对应的词：

代码

/**
 * Scores every distinct term of one document with TF-IDF and returns the
 * terms holding the highest scores.
 *
 * The term frequency is the term's occurrence count divided by the total
 * number of tokens in {@code filList}; the inverse document frequency comes
 * from {@link #showidf(String)}; the score is their product. The two largest
 * score values are selected, and every term whose score equals one of them
 * is returned, so ties can yield more than two entries. With fewer than two
 * distinct terms the result simply contains what is available; an empty
 * input gives an empty map.
 *
 * @param filList tokens of one document, stop words already removed
 * @return map from top-scoring term to its TF-IDF score; never null
 */
public HashMap<String, Double> showtfidf(ArrayList<String> filList) {
        // Number of highest score values to keep.
        final int topCount = 2;

        // Count how often each term occurs in the document.
        HashMap<String, Integer> termCounts = new HashMap<String, Integer>();
        for (String term : filList) {
            Integer seen = termCounts.get(term);
            termCounts.put(term, seen == null ? 1 : seen + 1);
        }

        HashMap<String, Double> scores = new HashMap<String, Double>();
        ArrayList<Double> sortedScores = new ArrayList<Double>();
        for (String term : termCounts.keySet()) {
            double tf = (double) termCounts.get(term) / filList.size();
            double idf = showidf(term);
            // TF-IDF is the product of the two factors, not their sum.
            double tfidf = tf * idf;
            scores.put(term, tfidf);
            sortedScores.add(tfidf);
        }
        Collections.sort(sortedScores);

        // Walk the ascending list from the end; the i >= 0 guard prevents
        // indexing past the front when fewer than topCount scores exist.
        ArrayList<Double> topScores = new ArrayList<Double>();
        for (int i = sortedScores.size() - 1;
                i >= 0 && i > sortedScores.size() - 1 - topCount; i--) {
            topScores.add(sortedScores.get(i));
        }

        HashMap<String, Double> topTerms = new HashMap<String, Double>();
        for (String term : scores.keySet()) {
            Double score = scores.get(term);
            if (topScores.contains(score)) {
                topTerms.put(term, score);
            }
        }
        return topTerms;
    }

    /**
     * Computes an inverse-document-frequency ratio for a term.
     *
     * A new {@code Filter} is created on every call and its {@code showall()}
     * result is scanned; the value returned is the number of lists in that
     * result divided by the number of lists containing {@code d}. No
     * logarithm is applied. If no list contains the term, 0 is returned.
     *
     * NOTE(review): presumably each inner list holds the terms of one
     * document in the corpus - confirm against Filter.showall().
     *
     * @param d the term to look up
     * @return list count divided by matching-list count, or 0 when the term
     *         appears in none of the lists
     */
    public double showidf(String d) {
        ArrayList<ArrayList> documents = new Filter().showall();
        int matching = 0;
        for (ArrayList document : documents) {
            if (document.contains(d)) {
                matching++;
            }
        }
        if (matching == 0) {
            return 0;
        }
        return (double) documents.size() / matching;
    }

待续--

posted on 2011-01-29 21:13 丁啸 阅读(469) 评论(3) 收藏 举报