文章相似度算法-余弦定理
其实关于余弦定理的文章有很多,我也看了不少,但我最喜欢、也认为最通俗易懂的一篇是这个:http://www.ruanyifeng.com/blog/2013/03/tf-idf.html 。它解除了我很多疑惑,现在推荐给大家;和我一样对向量、对计算过程感到疑惑的朋友,可以移步过去看看。我在这里就贴一下代码吧。
private static void CalculateTF(List<String> docs, Hashtable<String, TextStat> terms, ArrayList<Hashtable<Integer, Double>> tfs) { for (String doc : docs) { Hashtable<Integer, Integer> termNums = new Hashtable<Integer, Integer>(); String result = Net.DownLoad( "http://xxxxxx/Server/SegService", doc);// 部署的一个分词服务。 String[] words = result.split("\\|"); ArrayList wordList = new ArrayList(); for (String word : words) { if (!Config.StopWordList.containsKey(word)) { wordList.add(word); int index = -1; if (!terms.containsKey(word)) { index = terms.size(); terms.put(word, new TextStat(index)); } else { index = terms.get(word).index; } if (!termNums.containsKey(index)) { termNums.put(index, 1); terms.get(word).docNum++; // 词的文档数 } else { int temp = termNums.get(index); temp++; termNums.put(index, temp); } } } double len = (double) wordList.size(); Hashtable<Integer, Double> tf = new Hashtable<Integer, Double>(); // 词频 for (int key : termNums.keySet()) { int value = termNums.get(key); tf.put(key, (double) value / len); // 当前词的词数/总词数 } tfs.add(tf); } } private static void CalculateIDF(List<String> docs, Hashtable<String, TextStat> terms, Hashtable<Integer, Double> idfs) { double len = (double) docs.size(); for (String key : terms.keySet()) { TextStat textStat = terms.get(key); double idf = log(len / (double) textStat.docNum, Math.E); // ln(总文档数/当前词出现过的文档数) idfs.put(textStat.index, idf); } } private static Hashtable<Integer, String> Deduplication( ArrayList<Hashtable<Integer, Double>> tfidfs, ArrayList<String> resultList, Hashtable<Integer, String> resultHash) { for (int i = 0; i < tfidfs.size(); i++) { if (!resultHash.contains(i)) { for (int j = i + 1; j < tfidfs.size(); j++) { Hashtable<Integer, Double> tfidf = tfidfs.get(i); Hashtable<Integer, Double> temp = tfidfs.get(j); double dotProduct = CalcDotProduct(tfidf, temp); double length1 = CalcLength(tfidf); double length2 = CalcLength(temp); double cosine = dotProduct / (length1 * length2); if (cosine > 0.5) { 
tfidfs.remove(j); resultList.remove(j); Deduplication(tfidfs, resultList, resultHash); } else { if (!resultHash.containsKey(i)) { resultHash.put(i, resultList.get(i)); resultHash.put(j, resultList.get(j)); } if (i + 1 == tfidfs.size()) { resultHash.put(i + 1, resultList.get(i + 1)); } } } } } return resultHash; } private static void CalculateTFIDF( ArrayList<Hashtable<Integer, Double>> tfs, Hashtable<Integer, Double> idfs, ArrayList<Hashtable<Integer, Double>> tfidfs) { for (Hashtable<Integer, Double> tf : tfs) { Hashtable<Integer, Double> tfidf = new Hashtable<Integer, Double>(); for (int key : tf.keySet()) { Double value = tf.get(key); tfidf.put(key, value * idfs.get(key)); } tfidfs.add(tfidf); } } static public double log(double value, double base) { return Math.log(value) / Math.log(base); } // / <summary> // / 计算向量长度(vector length) // / </summary> // / <param name="vector"></param> // / <returns></returns> private static double CalcLength(Hashtable<Integer, Double> vector) { double length = 0; for (int key : vector.keySet()) { length += Math.pow(vector.get(key), 2); } return Math.sqrt(length); } // / <summary> // / 计算向量点积(dot product)/内积(inner product) // / </summary> // / <param name="vector1"></param> // / <param name="vector2"></param> // / <returns></returns> private static double CalcDotProduct(Hashtable<Integer, Double> vector1, Hashtable<Integer, Double> vector2) { double dotProduct = 0; for (int key : vector1.keySet()) { double value = vector1.get(key); if (vector2.containsKey(key)) { dotProduct += value * vector2.get(key); } } return dotProduct; }
浙公网安备 33010602011771号