C#实现字符串相似度比较[Levenshtein Distance算法]

字符串相似度算法使用 Levenshtein Distance算法(中文翻译：编辑距离算法) 这算法是由俄国科学家Levenshtein提出的.

下面使用C#实现

public class LevenshteinDistance
    {
        // Created once when the type is initialized, so every access to
        // Instance observes the same object.
        private static readonly LevenshteinDistance _instance = new LevenshteinDistance();

        /// <summary>
        /// Gets the shared <see cref="LevenshteinDistance"/> instance.
        /// </summary>
        public static LevenshteinDistance Instance
        {
            get { return _instance; }
        }

        /// <summary>
        /// Returns the smallest of three integers.
        /// </summary>
        /// <param name="first">First candidate.</param>
        /// <param name="second">Second candidate.</param>
        /// <param name="third">Third candidate.</param>
        /// <returns>The minimum of the three arguments.</returns>
        public int LowerOfThree(int first, int second, int third)
        {
            return Math.Min(first, Math.Min(second, third));
        }

        /// <summary>
        /// Computes the Levenshtein (edit) distance between two strings: the
        /// minimum number of single-character insertions, deletions and
        /// substitutions needed to turn <paramref name="str1"/> into
        /// <paramref name="str2"/>. Characters are compared as UTF-16 code units.
        /// </summary>
        /// <param name="str1">Source string.</param>
        /// <param name="str2">Target string.</param>
        /// <returns>The edit distance; 0 when the strings are equal.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="str1"/> or <paramref name="str2"/> is null.
        /// </exception>
        public int Levenshtein_Distance(string str1, string str2)
        {
            if (str1 == null)
            {
                throw new ArgumentNullException("str1");
            }
            if (str2 == null)
            {
                throw new ArgumentNullException("str2");
            }

            int n = str1.Length;
            int m = str2.Length;

            // Turning an empty string into the other one takes one insertion per character.
            if (n == 0)
            {
                return m;
            }
            if (m == 0)
            {
                return n;
            }

            // Each cell only depends on the row above and the cell to its left,
            // so two rolling rows replace the full (n+1) x (m+1) matrix.
            int[] previous = new int[m + 1];
            int[] current = new int[m + 1];

            // Row 0: distance from the empty prefix of str1 to each prefix of str2.
            for (int j = 0; j <= m; j++)
            {
                previous[j] = j;
            }

            for (int i = 1; i <= n; i++)
            {
                // Column 0: distance from the first i characters of str1 to the empty string.
                current[0] = i;
                char ch1 = str1[i - 1];

                for (int j = 1; j <= m; j++)
                {
                    int cost = ch1 == str2[j - 1] ? 0 : 1;
                    current[j] = LowerOfThree(
                        previous[j] + 1,         // deletion
                        current[j - 1] + 1,      // insertion
                        previous[j - 1] + cost); // match or substitution
                }

                // The row just filled becomes the "row above" for the next pass.
                int[] swap = previous;
                previous = current;
                current = swap;
            }

            return previous[m];
        }

        /// <summary>
        /// Computes the similarity of two strings as
        /// 1 - distance / max(length1, length2).
        /// </summary>
        /// <param name="str1">First string.</param>
        /// <param name="str2">Second string.</param>
        /// <returns>
        /// A fraction between 0 and 1 (multiply by 100 for a percentage);
        /// 1 when the strings are equal, including when both are empty.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="str1"/> or <paramref name="str2"/> is null.
        /// </exception>
        public decimal LevenshteinDistancePercent(string str1, string str2)
        {
            if (str1 == null)
            {
                throw new ArgumentNullException("str1");
            }
            if (str2 == null)
            {
                throw new ArgumentNullException("str2");
            }

            int maxLength = Math.Max(str1.Length, str2.Length);

            // Two empty strings are identical; returning early also avoids dividing by zero.
            if (maxLength == 0)
            {
                return 1m;
            }

            int distance = Levenshtein_Distance(str1, str2);
            return 1 - (decimal)distance / maxLength;
        }
    }

    class Program
    {
        /// <summary>
        /// Demo entry point: prints two sample strings and their similarity
        /// as a percentage, then waits for a line of input before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            string left = "你好蒂蒂";
            string right = "你好蒂芬";

            Console.WriteLine("字符串1 {0}", left);
            Console.WriteLine("字符串2 {0}", right);

            // LevenshteinDistancePercent returns a fraction, so scale it to a percentage.
            LevenshteinDistance calculator = LevenshteinDistance.Instance;
            decimal percent = calculator.LevenshteinDistancePercent(left, right) * 100;
            Console.WriteLine("相似度 {0} %", percent);

            Console.ReadLine();
        }
    }

我们在做数据系统的时候，经常会用到模糊搜索，但是，数据库提供的模糊搜索并不具备按照相关度进行排序的功能。
现在提供一个比较两个字符串相似度的方法。
通过计算出两个字符串的相似度，就可以通过Linq在内存中对数据进行排序和筛选，选出和目标字符串最相似的一个结果。

本次所用到的相似度计算公式是相似度=Kq*q/(Kq*q+Kr*r+Ks*s) (Kq > 0 , Kr>=0, Ks>=0)
其中，q是字符串1和字符串2中都存在的单词的总数，s是字符串1中存在，字符串2中不存在的单词总数，r是字符串2中存在，字符串1中不存在的单词总数. Kq,Kr和Ks分别是q,r,s的权重，根据实际的计算情况，我们设Kq=2，Kr=Ks=1.
根据这个相似度计算公式，得出以下程序代码：
/// <summary>
/// Computes a character-set similarity between two strings as
/// Kq*q / (Kq*q + Kr*r + Ks*s) with Kq = 2 and Kr = Ks = 1, where q is the
/// number of distinct characters present in both strings, s the number of
/// distinct characters only in <paramref name="sourceString"/>, and r the
/// number of distinct characters only in <paramref name="str"/>.
/// Character order and repetition are ignored.
/// </summary>
/// <remarks>
/// As an extension method, this must be declared inside a non-generic static class.
/// </remarks>
/// <param name="sourceString">The first string.</param>
/// <param name="str">The second string.</param>
/// <returns>
/// A value between 0 (no characters in common) and 1 (same set of
/// characters); 1 when both strings are empty.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="sourceString"/> or <paramref name="str"/> is null.
/// </exception>
public static decimal GetSimilarityWith(this string sourceString, string str)
{
    if (sourceString == null)
    {
        throw new System.ArgumentNullException("sourceString");
    }
    if (str == null)
    {
        throw new System.ArgumentNullException("str");
    }

    const decimal Kq = 2;
    const decimal Kr = 1;
    const decimal Ks = 1;

    // Intersect yields distinct characters, so s and r must also be counted on
    // distinct characters; otherwise identical strings with repeated
    // characters would score below 1.
    int q = sourceString.Intersect(str).Count();
    int s = sourceString.Distinct().Count() - q;
    int r = str.Distinct().Count() - q;

    decimal denominator = Kq * q + Kr * r + Ks * s;

    // The denominator is zero only when both strings are empty; treat them as identical.
    if (denominator == 0)
    {
        return 1;
    }

    return Kq * q / denominator;
}

字符串相似度算法（ Levenshtein Distance算法）

昨天论坛看到的，简单写了一下
题目：一个字符串可以通过增加一个字符，删除一个字符，替换一个字符得到另外一个字符串，假设，我们把从字符串A转换成字符串B，前面3种操作所执行的最少次数称为AB相似度
如  abc adc  度为 1
   ababababa babababab 度为 2
   abcd acdb 度为2

字符串相似度算法可以使用 Levenshtein Distance算法(中文翻译：编辑距离算法) 这算法是由俄国科学家Levenshtein提出的。其步骤

Step	Description
1	Set n to be the length of s. Set m to be the length of t. If n = 0, return m and exit. If m = 0, return n and exit. Construct a matrix d containing 0..n rows and 0..m columns.
2	Initialize the first row to 0..m. Initialize the first column to 0..n.
3	Examine each character of s (i from 1 to n).
4	Examine each character of t (j from 1 to m).
5	If s[i] equals t[j], the cost is 0. If s[i] doesn't equal t[j], the cost is 1.
6	Set cell d[i,j] of the matrix equal to the minimum of: a. The cell immediately above plus 1: d[i-1,j] + 1. b. The cell immediately to the left plus 1: d[i,j-1] + 1. c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost.
7	After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m].

C++实现如下

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

//算法

// Computes the Levenshtein (edit) distance between source and target: the
// minimum number of single-character insertions, deletions and substitutions
// needed to turn source into target. Characters are compared byte by byte.
// The strings are taken by const reference so that calls do not copy them.
int ldistance(const std::string& source, const std::string& target)
{
    // step 1: lengths and trivial cases
    const int n = static_cast<int>(source.length());
    const int m = static_cast<int>(target.length());

    if (m == 0) return n;
    if (n == 0) return m;

    // (n+1) x (m+1) matrix, value-initialised to 0 so matrix[0][0] is already correct
    typedef std::vector< std::vector<int> > Tmatrix;
    Tmatrix matrix(n + 1, std::vector<int>(m + 1));

    // step 2: first column and first row hold the distance to/from the empty prefix
    for (int i = 1; i <= n; i++) matrix[i][0] = i;
    for (int j = 1; j <= m; j++) matrix[0][j] = j;

    // step 3: each character of source
    for (int i = 1; i <= n; i++)
    {
        const char si = source[i - 1];

        // step 4: each character of target
        for (int j = 1; j <= m; j++)
        {
            // step 5: substitution is free when the characters match
            const int cost = (si == target[j - 1]) ? 0 : 1;

            // step 6: cheapest of deletion, insertion and match/substitution
            const int above = matrix[i - 1][j] + 1;
            const int left = matrix[i][j - 1] + 1;
            const int diag = matrix[i - 1][j - 1] + cost;

            matrix[i][j] = std::min(above, std::min(left, diag));
        }
    }

    // step 7: the bottom-right cell holds the distance between the full strings
    return matrix[n][m];
}

int main(){

string s;

string d;

cout<<"source=";

cin>>s;

cout<<"diag=";

cin>>d;

int dist=ldistance(s,d);

cout<<"dist="<<dist<<endl;

}