[Cosine Similarity] Computing Chinese Text Similarity
Similarity calculation
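Cosine similarity treats each text as a term-frequency vector and measures the angle between the two vectors: the dot product divided by the product of the vector magnitudes.

    cos(A, B) = Σ Aᵢ·Bᵢ / ( √(Σ Aᵢ²) × √(Σ Bᵢ²) )

A value of 1 means the two texts have identical word distributions; 0 means they share no words at all. The script below implements exactly this over jieba-segmented word counts.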
# Given two pieces of text A and B, compute their similarity
import jieba
from collections import Counter
def preprocess_data(text):
    """Preprocess the text: segment it into words and remove stopwords."""
    # Segment the text with the jieba tokenizer
    words = jieba.cut(text)
    # Remove stopwords; only a few examples are listed here -- a real
    # application should extend the list to fit its domain
    stopwords = ['的', '了', '和', '是', '就', '而', '及', '与', '或']
    filtered_words = [word for word in words if word not in stopwords]
    return filtered_words
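# Illustration (the exact segmentation depends on the jieba dictionary
# version, so treat the output below as an assumption, not a guarantee):
#   preprocess_data("我是你的情人")  ->  ['我', '你', '情人']
#   ('的' and '是' are dropped because they appear in the stopword list)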
def extract_features(words):
    """Extract features with a bag-of-words model (word -> frequency)."""
    # Return the Counter directly; converting it to a string and eval-ing
    # it back (as the original draft did) is both unsafe and unnecessary
    return Counter(words)
def cosine_similarity(features1, features2):
    """Compute the cosine similarity between two bag-of-words vectors."""
    # Dot product over the words the two texts share
    numerator = sum(features1[word] * features2[word] for word in set(features1) & set(features2))
    # Product of the two vector magnitudes
    denominator = ((sum(features1[word] ** 2 for word in features1) ** 0.5) * (
            sum(features2[word] ** 2 for word in features2) ** 0.5))
    if not denominator:
        return 0.0
    return round(numerator / denominator, 3)
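# Worked example (hypothetical hand-computed values, not produced by running
# the script): with features1 = Counter({'我': 1, '你': 1}) and
# features2 = Counter({'我': 1, '情人': 1}), the only shared word is '我',
# so numerator = 1*1 = 1 and denominator = sqrt(2) * sqrt(2) = 2,
# giving a similarity of 1 / 2 = 0.5.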
def check_duplicate(content, input_text, threshold=0.7):
    """Duplicate check: decide whether the current text duplicates an existing one."""
    # Preprocess the current text and extract its features
    words = preprocess_data(content)
    features = extract_features(words)

    # Simulate the features of an already-stored text
    existing_features = extract_features(preprocess_data(input_text))

    similarity = cosine_similarity(features, existing_features)

    # Compare against the similarity threshold to decide whether the
    # texts count as duplicates (the original draft computed this but
    # returned the same value on both branches, leaving the threshold unused)
    return similarity >= threshold, similarity
is_duplicate, similarity = check_duplicate("我是你的人", "我是你的情人")
print('similarity:', similarity, 'is_duplicate:', is_duplicate)
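As a cross-check, the same bag-of-words cosine similarity can be computed with scikit-learn. This is a minimal sketch, assuming scikit-learn is installed and reusing preprocess_data from above as the tokenizer; it is not part of the original script.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine

texts = ["我是你的人", "我是你的情人"]
# Build bag-of-words vectors, using the jieba-based preprocessor as the tokenizer
vectorizer = CountVectorizer(tokenizer=preprocess_data, token_pattern=None)
vectors = vectorizer.fit_transform(texts)
# cosine_similarity on the two row vectors returns a 1x1 matrix
print('sklearn similarity:', round(sk_cosine(vectors[0], vectors[1])[0][0], 3))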