获取Levenshtein距离矩阵

时间:2014-07-16 14:06:06

标签: java string

我编写了一些实现 Levenshtein 距离的 Java 代码,用来计算两个字符串的相似度。我的代码如下:

/**
 * Computes the Levenshtein (edit) distance between two strings and a
 * similarity score derived from it.
 *
 * <p>The comparison ignores case: both inputs are lower-cased before the
 * distance is computed.
 */
public class LevenshteinDistance {

    public LevenshteinDistance() {
    }

    /**
     * Returns the similarity of two strings, computed as
     * {@code (L - d) / L}, where {@code L} is the length of the longer input
     * and {@code d} is the case-insensitive edit distance.
     *
     * @param s1 first string, must not be {@code null}
     * @param s2 second string, must not be {@code null}
     * @return {@code 1.0} when both strings are empty, otherwise the ratio
     *         described above
     */
    public double similarity(String s1, String s2) {
        // Pass the longer string first so the working array inside
        // computeEditDistance is sized by the shorter one.
        String longer = s1;
        String shorter = s2;
        if (s1.length() < s2.length()) {
            longer = s2;
            shorter = s1;
        }
        int bigLen = longer.length();
        if (bigLen == 0) {
            return 1.0; // both strings are zero length
        }
        return (bigLen - computeEditDistance(longer, shorter)) / (double) bigLen;
    }

    /**
     * Computes the case-insensitive Levenshtein distance between two strings
     * using a single rolling row of the dynamic-programming matrix.
     *
     * @param s1 first string, must not be {@code null}
     * @param s2 second string, must not be {@code null}
     * @return the minimum number of single-character insertions, deletions
     *         and substitutions needed to turn one string into the other
     */
    public int computeEditDistance(String s1, String s2) {
        // Locale.ROOT keeps the case folding independent of the JVM's default
        // locale (e.g. under a Turkish locale "I" would not lower-case to "i").
        String a = s1.toLowerCase(java.util.Locale.ROOT);
        String b = s2.toLowerCase(java.util.Locale.ROOT);

        // costs[j] holds the distance between the processed prefix of a and
        // the first j characters of b.
        int[] costs = new int[b.length() + 1];
        for (int j = 0; j <= b.length(); j++) {
            costs[j] = j;
        }

        for (int i = 1; i <= a.length(); i++) {
            int diagonal = costs[0]; // value of the previous row at column j - 1
            costs[0] = i;
            for (int j = 1; j <= b.length(); j++) {
                int above = costs[j]; // previous row, same column
                int substitution = diagonal + (a.charAt(i - 1) == b.charAt(j - 1) ? 0 : 1);
                costs[j] = Math.min(Math.min(above + 1, costs[j - 1] + 1), substitution);
                diagonal = above;
            }
        }

        return costs[b.length()];
    }

    /**
     * Prints the similarity of two strings as a percentage to standard output
     * and returns that percentage.
     *
     * @param s1 first string, must not be {@code null}
     * @param s2 second string, must not be {@code null}
     * @return the similarity multiplied by 100
     */
    public double printDistance(String s1, String s2) {
        // Compute once; the value is both printed and returned.
        double percentage = similarity(s1, s2) * 100;
        System.out.println("[Edit Distance]      : " + s1 + " and " + s2 + " have similarity is = " + percentage + " %");
        return percentage;
    }

    public static void main(String[] args) {
        LevenshteinDistance lv = new LevenshteinDistance();
        lv.printDistance("164164617044", "164164617044");
    }

}

运行上面的代码,我得到如下输出:

[Edit Distance]      : 164164617044 and 164164617044 have similarity is = 100.0 %

如何才能像下面这样,输出表示 Levenshtein 距离的矩阵?

        1   6   4   1   6   4   6   1   7   0   4   4
    0   1   2   3   4   5   6   7   8   9   10  11  12
1   1   0   1   2   3   4   5   6   7   8   9   10  11
6   2   1   0   1   2   3   4   5   6   7   8   9   10
4   3   2   1   0   1   2   3   4   5   6   7   8   9
1   4   3   2   1   0   1   2   3   4   5   6   7   8
6   5   4   3   2   1   0   1   2   3   4   5   6   7
4   6   5   4   3   2   1   0   1   2   3   4   5   6
6   7   6   5   4   3   2   1   0   1   2   3   4   5
1   8   7   6   5   4   3   2   1   0   1   2   3   4
7   9   8   7   6   5   4   3   2   1   0   1   2   3
0   10  9   8   7   6   5   4   3   2   1   0   1   2
4   11  10  9   8   7   6   5   4   3   2   1   0   1
4   12  11  10  9   8   7   6   5   4   3   2   1   0

寻求帮助,谢谢

1 个答案:

答案 0 :(得分:0)

问题在于你的实现只计算了最终的距离,而没有保留完整的结果矩阵。

此解决方案基于 Wagner–Fischer 算法:

/**
 * Builds the full Levenshtein distance matrix for two strings (Wagner-Fischer
 * algorithm) and prints it together with the distance and similarity.
 */
public class LevenshteinDistance {

    public static void main(String[] args) {
        String str1 = "164164617044", str2 = "164164617044";

        System.out.print("String 1: \"" + str1 + "\" String 2: \"" + str2
                + "\".\n\n");

        int[][] resultingMatrix = resultingMatrix(str1, str2);
        // The distance is the bottom-right cell of the matrix.
        int distance = resultingMatrix[str1.length()][str2.length()];
        double similarity = similarity(str1, str2, distance);
        double similarityPercentage = similarity * 100.0;

        System.out.print("Levenshtein distance = " + distance + ".\n\n");
        System.out.print("Similarity = " + similarity + ".\n\n");
        System.out.print("Similarity Percentage = " + similarityPercentage
                + "%.\n\n");

        System.out.print("Resulting Matrix:\n");
        System.out.print(formatMatrix(resultingMatrix, str1, str2));
    }

    /**
     * Computes the complete edit-distance matrix for two strings.
     *
     * <p>Cell {@code [i][j]} holds the Levenshtein distance between the first
     * {@code i} characters of {@code str1} and the first {@code j} characters
     * of {@code str2}. The comparison is case-sensitive.
     *
     * @param str1 first string (matrix rows), must not be {@code null}
     * @param str2 second string (matrix columns), must not be {@code null}
     * @return a matrix of size {@code (str1.length() + 1) x (str2.length() + 1)}
     */
    public static int[][] resultingMatrix(String str1, String str2) {
        int rows = str1.length();
        int cols = str2.length();
        int[][] dist = new int[rows + 1][cols + 1];

        // First column: deleting i characters turns a prefix of str1 into "".
        for (int i = 0; i <= rows; i++) {
            dist[i][0] = i;
        }
        // First row: inserting j characters turns "" into a prefix of str2.
        for (int j = 1; j <= cols; j++) {
            dist[0][j] = j;
        }

        for (int i = 1; i <= rows; i++) {
            for (int j = 1; j <= cols; j++) {
                int substitutionCost = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
                int deletion = dist[i - 1][j] + 1;
                int insertion = dist[i][j - 1] + 1;
                int substitution = dist[i - 1][j - 1] + substitutionCost;
                dist[i][j] = Math.min(Math.min(deletion, insertion), substitution);
            }
        }
        return dist;
    }

    /**
     * Converts an edit distance into a similarity ratio, computed as
     * {@code (L - distance) / L} where {@code L} is the length of the longer
     * string.
     *
     * @param str1     first string, must not be {@code null}
     * @param str2     second string, must not be {@code null}
     * @param distance edit distance between the two strings
     * @return {@code 1.0} when both strings are empty, otherwise the ratio
     *         described above
     */
    public static double similarity(String str1, String str2, int distance) {
        double bigLength = Math.max(str1.length(), str2.length());
        if (bigLength == 0) {
            // Two empty strings are identical; avoids 0.0 / 0.0 == NaN.
            return 1.0;
        }
        return (bigLength - distance) / bigLength;
    }

    /** Renders the matrix as text with str2 as column headers and str1 as row labels. */
    private static String formatMatrix(int[][] matrix, String str1, String str2) {
        StringBuilder out = new StringBuilder();

        // Header row: two blank columns (row label + the empty-prefix column),
        // then one quoted character of str2 per column.
        for (int j = 0; j < str2.length(); j++) {
            if (j == 0) {
                out.append(String.format("%-5s%-5s", " ", " "));
            }
            out.append(String.format("%-5s", "'" + str2.charAt(j) + "'"));
        }
        out.append("\n");

        for (int i = 0; i < matrix.length; i++) {
            // Row 0 corresponds to the empty prefix of str1 and has no label.
            String label = (i == 0) ? " " : "'" + str1.charAt(i - 1) + "'";
            out.append(String.format("%-4s ", label));
            for (int cell : matrix[i]) {
                out.append(String.format("%-5s", cell));
            }
            out.append("\n");
        }
        return out.toString();
    }
}

示例:

String 1: "164164617044" String 2: "164164617044".

Levenshtein distance = 0.

Similarity = 1.0.

Similarity Percentage = 100.0%.

Resulting Matrix:
          '1'  '6'  '4'  '1'  '6'  '4'  '6'  '1'  '7'  '0'  '4'  '4'  
     0    1    2    3    4    5    6    7    8    9    10   11   12   
'1'  1    0    1    2    3    4    5    6    7    8    9    10   11   
'6'  2    1    0    1    2    3    4    5    6    7    8    9    10   
'4'  3    2    1    0    1    2    3    4    5    6    7    8    9    
'1'  4    3    2    1    0    1    2    3    4    5    6    7    8    
'6'  5    4    3    2    1    0    1    2    3    4    5    6    7    
'4'  6    5    4    3    2    1    0    1    2    3    4    5    6    
'6'  7    6    5    4    3    2    1    0    1    2    3    4    5    
'1'  8    7    6    5    4    3    2    1    0    1    2    3    4    
'7'  9    8    7    6    5    4    3    2    1    0    1    2    3    
'0'  10   9    8    7    6    5    4    3    2    1    0    1    2    
'4'  11   10   9    8    7    6    5    4    3    2    1    0    1    
'4'  12   11   10   9    8    7    6    5    4    3    2    1    0

请注意,Levenshtein 距离始终是矩阵右下角的元素(resultingMatrix[str1.length()][str2.length()])。另外,请看一下我的解决方案中计算相似度的方法。