谁能发现我的 Damerau-Levenshtein 距离实现中的错误?

时间:2010-09-29 02:48:29

标签: c#

我遇到了一个奇怪的错误,快把我逼疯了。谁能帮我找到它?试着用两个仅相差最后一个字符的单词(“garble”与“garbl”)调用该函数:它返回 0,而不是预期的 1。它应该返回 1,对吗?

我试过摆弄数组边界,但这只会导致IndexOutOfRangeExceptions

/// <summary>
/// Static helper exposing one edit-distance style scoring function for two strings.
/// </summary>
public static class FuzzyStringMatcher
{
    // Slot indices into the per-cell "calculations" scratch array (one slot per edit kind).
    private const int DELETION = 0;
    private const int INSERTION = 1;
    private const int SUBSTITUTION = 2;
    private const int TRANSPOSITION = 3;

    // Cost added to a table cell for each kind of edit; all are 1 here.
    private const int COST_OF_DELETION = 1;
    private const int COST_OF_INSERTION = 1;
    private const int COST_OF_TRANSPOSITION = 1;
    private const int COST_OF_SUBSTITUTION = 1;

    /// <summary>
    /// Fills a (a.Length + 1) x (b.Length + 1) table with a deletion / insertion /
    /// substitution / transposition recurrence and derives a score from it.
    /// </summary>
    /// <param name="a">First string; dereferenced without a null check.</param>
    /// <param name="b">Second string; dereferenced without a null check.</param>
    /// <returns>
    /// The table cell at [upper bound - 1, upper bound - 1], halved (integer division)
    /// when either string contains the other.
    /// </returns>
    /// <remarks>
    /// NOTE(review): Array.GetUpperBound returns the highest valid index (a.Length / b.Length),
    /// not the element count, so every "&lt;" loop below stops one index short and the last
    /// row and column of the table are never written.
    /// NOTE(review): when a or b is empty the upper bound is 0, so the final read uses
    /// index -1 and throws IndexOutOfRangeException.
    /// </remarks>
    public static int Compute_DamerauLevenshtein_Distance(string a, string b)
    {
        int[,] rows = new int[a.Length + 1, b.Length + 1];
        int cost_ratio;
        int[] calculations = new int[4];

        //
        // Seed the first column and first row with 0, 1, 2, ...
        // NOTE(review): "<" leaves rows[a.Length, 0] and rows[0, b.Length] at their default 0.
        //
        for (int i = 0; i < rows.GetUpperBound(0); i++)
            rows[i, 0] = i;

        for (int i = 0; i < rows.GetUpperBound(1); i++)
            rows[0, i] = i;


        // Fill the table cell by cell; cell [aidx, bidx] pairs a[aidx - 1] with b[bidx - 1].
        // NOTE(review): "<" means the cells for the last character of each string are skipped.
        for (int aidx = 1; aidx < rows.GetUpperBound(0); aidx++)
        {
            for (int bidx = 1; bidx < rows.GetUpperBound(1); bidx++)
            {
                // 0 when the current pair of characters matches, 1 otherwise.
                if (a[aidx - 1] == b[bidx - 1])
                    cost_ratio = 0;
                else
                    cost_ratio = 1;

                calculations[DELETION] = rows[aidx - 1, bidx] + COST_OF_DELETION;
                calculations[INSERTION] = rows[aidx, bidx - 1] + COST_OF_INSERTION;
                calculations[SUBSTITUTION] = rows[aidx - 1, bidx - 1] + cost_ratio * COST_OF_SUBSTITUTION;
                // Sentinel so the transposition slot never wins Min() unless it is set below.
                calculations[TRANSPOSITION] = int.MaxValue;

                // NOTE(review): this compares a[aidx] / b[bidx], one position ahead of the
                // pair (a[aidx - 1], b[bidx - 1]) used for cost_ratio above; it stays in range
                // only because the loops stop one index early.
                if (aidx > 1 && bidx > 1 && a[aidx] == b[bidx - 1] && a[aidx - 1] == b[bidx])
                    calculations[TRANSPOSITION] = rows[aidx - 2, bidx - 2] + cost_ratio * COST_OF_TRANSPOSITION;

                // Min() on an int[] is presumably the System.Linq extension; the using
                // directive is not visible here.
                rows[aidx, bidx] = calculations.Min();
            }
        }

        // NOTE(review): reads the cell one row and one column before the table's last cell,
        // i.e. the value computed for both strings with their final character dropped.
        int score = rows[rows.GetUpperBound(0) - 1, rows.GetUpperBound(1) - 1];
        // Halve the score when one string contains the other. Integer division: 1 / 2 == 0,
        // so a score of 1 is reported as 0 in that case.
        if (a.Contains(b) || b.Contains(a))
            score = score / 2;
        return score;
    }
}

我的实现基于维基百科 Damerau-Levenshtein Distance 页面中给出的算法。

2 个答案:

答案 0 :(得分:3)

+1 Lou Franco。但除此之外,您的代码似乎还有不少索引问题(请注意,wiki 示例中全部 4 个 for 循环的上界都是包含在内的;另外,在 wiki 示例中从 aidx / bidx 减 1 的地方,您实际上需要减 2,因为 wiki 示例中字符串的索引从 1 开始)。我的版本:

    /// <summary>
    /// Computes the Damerau-Levenshtein distance between two strings in its restricted
    /// (optimal string alignment) form: the minimum total cost of deletions, insertions,
    /// substitutions and swaps of two adjacent characters that turns <paramref name="a"/>
    /// into <paramref name="b"/>.
    /// </summary>
    /// <param name="a">Source string. Must not be null; may be empty.</param>
    /// <param name="b">Target string. Must not be null; may be empty.</param>
    /// <returns>The distance; 0 when the strings are equal.</returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="a"/> or <paramref name="b"/> is null.
    /// </exception>
    public static int Compute_DamerauLevenshtein_Distance2(string a, string b)
    {
        // Fail with a descriptive exception instead of a NullReferenceException on a.Length.
        if (a == null)
            throw new System.ArgumentNullException("a");
        if (b == null)
            throw new System.ArgumentNullException("b");

        // rows[i, j] holds the distance between the first i chars of a and the first j chars of b.
        int[,] rows = new int[a.Length + 1, b.Length + 1];

        // Turning a prefix into the empty string (or back) takes one edit per character.
        // Bounds are inclusive: the last row and column belong to the full strings.
        for (int i = 0; i <= a.Length; i++)
            rows[i, 0] = i;

        for (int j = 1; j <= b.Length; j++)
            rows[0, j] = j;

        for (int aidx = 1; aidx <= a.Length; aidx++)
        {
            for (int bidx = 1; bidx <= b.Length; bidx++)
            {
                // Table indices are 1-based relative to the strings, hence the "- 1".
                int cost_ratio = a[aidx - 1] == b[bidx - 1] ? 0 : 1;

                // Plain Math.Min calls avoid a LINQ enumeration over a scratch array per cell.
                int best = System.Math.Min(
                    rows[aidx - 1, bidx] + COST_OF_DELETION,
                    rows[aidx, bidx - 1] + COST_OF_INSERTION);
                best = System.Math.Min(best, rows[aidx - 1, bidx - 1] + cost_ratio * COST_OF_SUBSTITUTION);

                // Adjacent transposition: the current and previous characters of a appear
                // swapped in b. Needs two characters on each side, hence the "> 1" guards.
                if (aidx > 1 && bidx > 1 && a[aidx - 1] == b[bidx - 2] && a[aidx - 2] == b[bidx - 1])
                    best = System.Math.Min(best, rows[aidx - 2, bidx - 2] + cost_ratio * COST_OF_TRANSPOSITION);

                rows[aidx, bidx] = best;
            }
        }

        // The bottom-right cell covers both complete strings.
        return rows[a.Length, b.Length];
    }

答案 1 :(得分:2)

这不在维基百科的文章中:

// Halves the score when either string contains the other; integer division turns a score of 1 into 0.
if (a.Contains(b) || b.Contains(a))
        score = score / 2;

在你的例子中这个条件成立——而整数除法 1/2 == 0,所以问题很可能就出在这里。