# 编辑距离（Minimum Edit Distance）

• i == 0 或 j == 0
• $lev(i, j) = max(i, j)$
• i，j 不为0， 且 $a[i] == b[j]$
• $lev(i, j) = lev(i-1, j-1)$
• i，j 不为0， 且 $a[i] != b[j]$
• 插入： $lev(i, j-1) + 1$
• 删除： $lev(i-1, j) + 1$
• 替换： $lev(i-1, j-1) + 1$
• 三者取最小

# C++ 实现

// Computes the Levenshtein (minimum edit) distance between two strings with
// bottom-up dynamic programming.
class Solution {
public:
    // DP table filled by the most recent editDistance() call: matrix[i][j] is
    // the edit distance between the first i characters of str1 and the first
    // j characters of str2.
    std::vector<std::vector<int>> matrix;

    // Returns the minimum number of single-character insertions, deletions
    // and substitutions needed to turn str1 into str2.
    int editDistance(const std::string& str1, const std::string& str2) {
        const int len1 = static_cast<int>(str1.size());
        const int len2 = static_cast<int>(str2.size());

        matrix.assign(len1 + 1, std::vector<int>(len2 + 1, 0));

        // Base cases: turning a prefix into the empty string (or the reverse)
        // costs one edit per character.
        for (int i = 0; i <= len1; ++i)
            matrix[i][0] = i;
        for (int j = 1; j <= len2; ++j)
            matrix[0][j] = j;

        for (int i = 1; i <= len1; ++i) {
            for (int j = 1; j <= len2; ++j) {
                // Row i / column j describe prefixes of length i / j, whose
                // last characters sit at 0-based indices i - 1 and j - 1.
                if (str1[i - 1] == str2[j - 1]) {
                    // Last characters match: no extra edit needed.
                    matrix[i][j] = matrix[i - 1][j - 1];
                } else {
                    const int insertCost  = matrix[i][j - 1] + 1;      // insert
                    const int deleteCost  = matrix[i - 1][j] + 1;      // delete
                    const int replaceCost = matrix[i - 1][j - 1] + 1;  // replace
                    matrix[i][j] = std::min(insertCost, std::min(deleteCost, replaceCost));
                }
            }
        }
        return matrix[len1][len2];
    }
};


# python-Levenshtein 库

pip install python-Levenshtein

import Levenshtein
# Plain Levenshtein distance has no transposition operation, so swapping the
# adjacent "h" and "e" costs two edits; this should print 2.
print(Levenshtein.distance("the","teh"))


# 单词纠错

$p(c | w)=\frac{p(c) * p(w | c)}{p(w)}$

• $p(w|c)$ : 找出与 w 的编辑距离为 i 的所有正确单词（即候选词 c）；i 越小，$p(w|c)$ 越大
• $p(c)$ : 在上面的c中，找出频率最高的单词
import re
from collections import Counter

def words(text):
    """Return every run of word characters in ``text``, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def P(word, N=sum(WORDS.values())):
    """Return the probability of ``word``: its count in ``WORDS`` divided by ``N``.

    ``N`` defaults to the sum of all values in ``WORDS``. The default is
    evaluated once, when this function is defined, so ``WORDS`` must already
    exist at that point and later changes to it do not affect the default.

    NOTE(review): ``WORDS`` is not defined in this snippet; presumably it is a
    ``collections.Counter`` of word frequencies built from a corpus -- confirm.
    """
    return WORDS[word] / N

def correction(word):
    """Return the candidate for ``word`` with the highest probability ``P``."""
    return max(candidates(word), key=P)

def candidates(word):
    """Return the candidate corrections for ``word``.

    The first non-empty result in this order is returned:

    1. ``word`` itself, if it is a known word;
    2. the known words one edit away;
    3. the known words two edits away;
    4. ``[word]`` unchanged, when nothing known was found.

    Cases 1-3 yield a set (from ``known``); case 4 yields a one-element list.
    """
    return known([word]) or known(edits1(word)) or known(edits2(word)) or [word]

def known(words):
    """Return the set of items from ``words`` that are present in ``WORDS``."""
    return {w for w in words if w in WORDS}

def edits1(word):
    """Return the set of all strings that are one edit away from ``word``.

    An edit is one of: deleting a character, swapping two adjacent
    characters, replacing a character with a lowercase ASCII letter, or
    inserting a lowercase ASCII letter. Replacing a character with itself is
    included, so a non-empty ``word`` is a member of its own result.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word in two, including an empty head or tail.
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [head + tail[1:] for head, tail in splits if tail]
    transposes = [
        head + tail[1] + tail[0] + tail[2:]
        for head, tail in splits
        if len(tail) > 1
    ]
    replaces = [head + c + tail[1:] for head, tail in splits if tail for c in letters]
    inserts = [head + c + tail for head, tail in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word):
    """Return a generator over all strings that are two edits away from ``word``.

    Each string is produced by applying ``edits1`` to every result of
    ``edits1(word)``; duplicates are not removed.
    """
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))


• 对于表达式 `list1 or list2`：如果list1不为空，那么返回list1
• 如果list1为空，那么返回list2

return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

• 如果单词是正确的就直接返回
• 如果单词是错的，就返回编辑距离是1的所有正确单词作为候选词
• 如果编辑距离是1的正确单词没有，就返回编辑距离是2的所有正确单词作为候选词
• 如果还是为空，就返回他自己
posted @ 2020-07-27 11:24  twilight0402  阅读(31)  评论(0)  编辑  收藏