提取两段文本中所有词(事先需用空格分割好)
计算每个词在两段文本中分别出现的次数(用BSD tree.h里的RBTREE保存)
用词频作为向量A、B的分量
使用余弦公式计算AB夹角的余弦值

jaccard：

计算步骤与上面的余弦相似度大体相同，只是最后一步改用 Jaccard 系数公式计算

  #!/usr/bin/env python
#coding=utf-8
"""Similarity measures over term-frequency vectors.

A term vector is a dictionary mapping ``term -> frequency``.  The module
offers cosine similarity and a multiset Jaccard similarity, plus the small
vector helpers they are built from.
"""

import math

# psyco is used purely as an accelerator; the module must still import when
# it is not installed.
try:
    import psyco
except ImportError:
    psyco = None
else:
    psyco.full()

# The segmenter is only needed by term_vec(); the vector maths below works on
# plain dictionaries, so a missing mmseg must not prevent importing them.
try:
    from mmseg.search import seg_txt_2_dict
except ImportError:
    seg_txt_2_dict = None


# --- top-level functions ---

def measure_similarity(file_a, file_b, sim_func=None):
    """Return the textual similarity of file_a and file_b.

    Both arguments are turned into term vectors with term_vec() and then
    compared with ``sim_func``.

    file_a, file_b -- inputs handed unchanged to term_vec()
                      (presumably text to segment; confirm against mmseg).
    sim_func       -- callable taking two term vectors; defaults to
                      cosine_sim when None.
    """
    if sim_func is None:
        sim_func = cosine_sim  # default metric

    return sim_func(term_vec(file_a), term_vec(file_b))


def cosine_sim(u, v):
    """Return the cosine similarity of u and v: <u,v> / (|u||v|).

    |u| is the L2 norm.  Returns 0 when either vector has zero norm, since
    the quotient is undefined there.
    """
    div = l2_norm(u) * l2_norm(v)
    if div == 0:
        return 0
    # Reuse the denominator instead of recomputing both norms.
    return dot_product(u, v) / div


def jaccard_sim(A, B):
    r"""Return the Jaccard similarity of A and B: |A \cap B| / |A \cup B|.

    A and B are treated as multisets (the Jaccard coefficient is technically
    defined over sets).  The denominator comes from mag_union(), which adds
    the two magnitudes together, so two identical non-empty multisets score
    0.5 rather than 1.  Returns 0 when both multisets are empty.
    """
    div = mag_union(A, B)
    if div == 0:
        return 0
    # Force true division: with integer counts a bare "/" truncates to 0
    # on Python 2.
    return float(mag_intersect(A, B)) / div


# --- Term-vector operations ---

def dot_product(v1, v2):
    """Return the dot product of two term vectors as a float.

    Only terms present in both vectors contribute.
    """
    return sum((v1[term] * v2[term] for term in v1 if term in v2), 0.0)


def l2_norm(v):
    """Return the L2 (Euclidean) norm of term vector v."""
    return math.sqrt(sum((v[term] ** 2 for term in v), 0.0))


def mag_union(A, B):
    """Return the magnitude used as the multiset union of A and B.

    This is the sum of all frequencies in A plus all frequencies in B;
    terms shared by both are counted from each side.
    """
    return sum(A.values()) + sum(B.values())


def mag_intersect(A, B):
    """Return the magnitude of the multiset intersection of A and B.

    Each term present in both contributes the smaller of its two
    frequencies.
    """
    return sum(min(A[term], B[term]) for term in A if term in B)


# --- Utilities for creating term vectors from data ---

def term_vec(f):
    """Return a term vector for f as a dictionary {term -> frequency}.

    Delegates to mmseg's seg_txt_2_dict.  Raises ImportError when mmseg is
    not installed.
    """
    if seg_txt_2_dict is None:
        raise ImportError(
            "mmseg.search.seg_txt_2_dict is required to build term vectors")
    return seg_txt_2_dict(f)


# --- Exceptions ---

class Error(Exception):
    """Base class for exception types used in this module."""
    pass


class FileFormatException(Error):
    """File-format error; not raised by any code in this listing.

    Derives from Error so that ``except Error`` catches it; it remains an
    Exception subclass for existing handlers.
    """
    pass