文章相似度算法-余弦定理

  其实关于余弦定理的文章有很多,我也看了很多,但我个人认为最通俗易懂的一篇是这个:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html。现在推荐给大家,这篇文章解开了我的很多疑惑;如果大家跟我一样对向量和相关计算感到疑惑,可以移步过去看看。我在这里就贴一下代码吧。

  

/**
 * Computes the term-frequency (TF) vector of each document.
 *
 * Each document is sent to a remote word-segmentation service; stop words are
 * discarded, every distinct term is assigned a global index in {@code terms},
 * and the term's document frequency ({@code TextStat.docNum}) is maintained
 * for the later IDF computation.
 *
 * @param docs  raw document texts
 * @param terms global term table: word -> (index, document frequency); updated in place
 * @param tfs   output: one sparse vector per document, term index -> tf
 *              (occurrences of the term / total non-stop words in the document)
 */
private static void CalculateTF(List<String> docs,
        Hashtable<String, TextStat> terms,
        ArrayList<Hashtable<Integer, Double>> tfs) {
    for (String doc : docs) {
        // term index -> occurrence count within the current document
        Hashtable<Integer, Integer> termNums = new Hashtable<Integer, Integer>();
        String result = Net.DownLoad(
                "http://xxxxxx/Server/SegService", doc); // deployed word-segmentation service
        String[] words = result.split("\\|");
        int wordCount = 0; // total non-stop words in this document
        for (String word : words) {
            if (Config.StopWordList.containsKey(word)) {
                continue; // skip stop words entirely
            }
            wordCount++;
            int index;
            TextStat stat = terms.get(word);
            if (stat == null) {
                // first time this term is seen in any document: assign the next index
                index = terms.size();
                terms.put(word, new TextStat(index));
            } else {
                index = stat.index;
            }
            Integer count = termNums.get(index);
            if (count == null) {
                // first occurrence within this document
                termNums.put(index, 1);
                terms.get(word).docNum++; // bump the term's document frequency
            } else {
                termNums.put(index, count + 1);
            }
        }
        // tf = occurrences / total words; guard keeps an empty doc from dividing by zero
        Hashtable<Integer, Double> tf = new Hashtable<Integer, Double>();
        if (wordCount > 0) {
            double len = (double) wordCount;
            for (int key : termNums.keySet()) {
                tf.put(key, termNums.get(key) / len);
            }
        }
        tfs.add(tf);
    }
}

    /**
     * Fills {@code idfs} with the inverse document frequency of every known term:
     * idf = ln(total number of documents / number of documents containing the term).
     *
     * @param docs  all documents (only the count is used)
     * @param terms global term table built by {@code CalculateTF}
     * @param idfs  output map: term index -> idf value
     */
    private static void CalculateIDF(List<String> docs,
            Hashtable<String, TextStat> terms, Hashtable<Integer, Double> idfs) {
        final double total = docs.size();
        for (TextStat stat : terms.values()) {
            // ln(total docs / docs containing this term)
            idfs.put(stat.index, log(total / (double) stat.docNum, Math.E));
        }
    }

    /**
     * Removes near-duplicate documents by comparing the cosine similarity of
     * their tf-idf vectors; any later document whose similarity to an earlier
     * one exceeds 0.5 is dropped from both {@code tfidfs} and {@code resultList}.
     *
     * Fixes over the previous version:
     * - {@code resultHash.contains(i)} checked VALUES, not keys
     *   (Hashtable.contains == containsValue), so the guard never worked;
     * - each removal triggered a full recursive re-run while the stale inner
     *   loop kept iterating over shifted indices;
     * - the {@code i + 1 == tfidfs.size()} branch was unreachable inside the
     *   inner loop and is removed.
     *
     * @param tfidfs     per-document tf-idf vectors; pruned in place
     * @param resultList document texts, parallel to {@code tfidfs}; pruned in place
     * @param resultHash output: surviving position -> document text
     * @return {@code resultHash}, for convenience
     */
    private static Hashtable<Integer, String> Deduplication(
            ArrayList<Hashtable<Integer, Double>> tfidfs,
            ArrayList<String> resultList, Hashtable<Integer, String> resultHash) {
        for (int i = 0; i < tfidfs.size(); i++) {
            Hashtable<Integer, Double> tfidf = tfidfs.get(i);
            double length1 = CalcLength(tfidf); // loop-invariant for the inner scan
            // Compare document i against every later document. When j is removed,
            // the next element slides into position j, so only advance j on "keep".
            for (int j = i + 1; j < tfidfs.size(); ) {
                Hashtable<Integer, Double> temp = tfidfs.get(j);
                double cosine = CalcDotProduct(tfidf, temp)
                        / (length1 * CalcLength(temp));
                if (cosine > 0.5) {
                    // near-duplicate of an earlier document: drop it
                    tfidfs.remove(j);
                    resultList.remove(j);
                } else {
                    j++;
                }
            }
            // document i survived every comparison: keep it
            if (!resultHash.containsKey(i)) {
                resultHash.put(i, resultList.get(i));
            }
        }
        return resultHash;
    }

    /**
     * Builds a tf-idf vector for every document: weight = tf * idf per term.
     *
     * @param tfs    per-document term-frequency vectors (term index -> tf)
     * @param idfs   idf value per term index
     * @param tfidfs output list; receives one tf-idf vector per input vector,
     *               in the same order
     */
    private static void CalculateTFIDF(
            ArrayList<Hashtable<Integer, Double>> tfs,
            Hashtable<Integer, Double> idfs,
            ArrayList<Hashtable<Integer, Double>> tfidfs) {
        for (Hashtable<Integer, Double> tf : tfs) {
            Hashtable<Integer, Double> weights = new Hashtable<Integer, Double>();
            for (Integer index : tf.keySet()) {
                // scale the term's frequency by its rarity across the corpus
                weights.put(index, tf.get(index) * idfs.get(index));
            }
            tfidfs.add(weights);
        }
    }

    /**
     * Returns the logarithm of {@code value} in an arbitrary {@code base},
     * via the change-of-base identity log_b(v) = ln(v) / ln(b).
     *
     * @param value the number whose logarithm is taken
     * @param base  the logarithm base
     * @return log of {@code value} to the given base
     */
    static public double log(double value, double base) {
        double numerator = Math.log(value);
        double denominator = Math.log(base);
        return numerator / denominator;
    }

    /**
     * Computes the Euclidean (L2) length of a sparse vector.
     *
     * @param vector sparse vector: term index -> weight
     * @return the square root of the sum of squared weights (0 for an empty vector)
     */
    private static double CalcLength(Hashtable<Integer, Double> vector) {
        double sumOfSquares = 0;
        for (double weight : vector.values()) {
            sumOfSquares += Math.pow(weight, 2);
        }
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Computes the dot (inner) product of two sparse vectors. Only indices
     * present in both vectors contribute to the sum.
     *
     * @param vector1 first sparse vector: term index -> weight
     * @param vector2 second sparse vector: term index -> weight
     * @return the sum over shared indices of the pairwise products
     */
    private static double CalcDotProduct(Hashtable<Integer, Double> vector1,
            Hashtable<Integer, Double> vector2) {
        double sum = 0;
        for (Integer index : vector1.keySet()) {
            // Hashtable never stores null values, so a null lookup means "absent"
            Double other = vector2.get(index);
            if (other != null) {
                sum += vector1.get(index) * other;
            }
        }
        return sum;
    }
View Code

 

 

 

posted @ 2014-05-18 16:09  must Do  阅读(345)  评论(0)    收藏  举报