余弦相似度判断
1.两个文件相似度比较
文本文件
1)先切词,再进行编码,把文本转换为向量
2)使用余弦相似度公式计算(两个向量的点积除以各自模长的乘积)
二进制文件
按字节读取文件内容作为数值向量,较短的向量在末尾补零对齐后,直接计算余弦相似度
#!/usr/bin/python3
# -*- coding: UTF-8 -*-
import sys
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def read_byte_vector(path):
    """Return the contents of the file at *path* as a 1-D integer array.

    The file is opened in binary mode and every byte becomes one element
    holding its unsigned value (0-255).
    """
    with open(path, 'rb') as f:
        raw = f.read()
    # Decode the whole buffer in one native call instead of looping over the
    # bytes in Python. frombuffer returns a read-only uint8 view, so widen it
    # into a writable int64 copy before handing it on.
    return np.frombuffer(raw, dtype=np.uint8).astype(np.int64)


def pad_to_equal_length(first, second):
    """Zero-pad the shorter of two 1-D arrays at its end.

    Returns the pair ``(first, second)`` with equal sizes. An array that is
    already as long as the other one is returned unchanged.
    """
    gap = first.size - second.size
    if gap > 0:
        second = np.pad(second, (0, gap), mode='constant')
    elif gap < 0:
        first = np.pad(first, (0, -gap), mode='constant')
    return first, second


def main(argv):
    """Print the cosine similarity between the bytes of two files.

    *argv* is the full argument list: program name first, then the two file
    paths. Any further arguments are ignored. Exits with a message when fewer
    than two paths are given or when both files are empty.
    """
    if len(argv) < 3:
        sys.exit('usage: <script> FILE1 FILE2')

    pcm1, pcm2 = pad_to_equal_length(
        read_byte_vector(argv[1]),
        read_byte_vector(argv[2]),
    )
    if pcm1.size == 0:
        # Both inputs are empty, so there is no vector to compare.
        sys.exit('both files are empty; cosine similarity is undefined')

    # cosine_similarity works on 2-D input, so present each vector as a
    # single-row matrix and read the one resulting entry.
    sim = cosine_similarity(np.expand_dims(pcm1, axis=0), np.expand_dims(pcm2, axis=0))
    print('两个pcm相似度:', sim[0][0])


if __name__ == '__main__':
    main(sys.argv)

浙公网安备 33010602011771号