修改Damerau-Levenshtein算法以跟踪转换(插入,删除等)

时间:2017-06-19 21:53:37

标签: python algorithm damerau-levenshtein

我想知道如何修改Damerau-Levenshtein算法来跟踪将源字符串更改为目标字符串所需的特定字符转换。 Levenshtein distance已经回答了这个问题,但我找不到DL距离的任何答案。

我查看了py-Levenshtein模块:它提供了我需要的内容,但是对于Levenshtein距离:

Levenshtein.editops("FBBDE", "BCDASD")
[('delete', 0, 0), ('replace', 2, 1), ('insert', 4, 3), ('insert', 4, 
4), ('replace', 4, 5)]

editops的代码难以解密,因为它是用C语言编写的。我想知道跟踪转换是如何有效地完成的:我想这可能来自距离矩阵,它看起来像:

      r  e  p  u  b  l  i  c  a  n
   0  1  2  3  4  5  6  7  8  9  10
d  1  1  2  3  4  5  6  7  8  9  10
e  2  2  1  2  3  4  5  6  7  8  9
m  3  3  2  2  3  4  5  6  7  8  9
o  4  4  3  3  3  4  5  6  7  8  9
c  5  5  4  4  4  4  5  6  6  7  8
r  6  5  5  5  5  5  5  6  7  7  8
a  7  6  6  6  6  6  6  6  7  7  8
t  8  7  7  7  7  7  7  7  7  8  8

1 个答案:

答案 0 :(得分:3)

import numpy as np

def levenshtein_distance(string1, string2):
    n1 = len(string1)
    n2 = len(string2)
    return _levenshtein_distance_matrix(string1, string2)[n1, n2]

def damerau_levenshtein_distance(string1, string2):
    n1 = len(string1)
    n2 = len(string2)
    return _levenshtein_distance_matrix(string1, string2, True)[n1, n2]

def get_ops(string1, string2, is_damerau=False):
    i, j = _levenshtein_distance_matrix(string1, string2, is_damerau).shape
    i -= 1
    j -= 1
    ops = list()
    while i != -1 and j != -1:
        if is_damerau:
            if i > 1 and j > 1 and string1[i-1] == string2[j-2] and string1[i-2] == string2[j-1]:
                if dist_matrix[i-2, j-2] < dist_matrix[i, j]:
                    ops.insert(0, ('transpose', i - 1, i - 2))
                    i -= 2
                    j -= 2
                    continue
        index = np.argmin([dist_matrix[i-1, j-1], dist_matrix[i, j-1], dist_matrix[i-1, j]])
        if index == 0:
            if dist_matrix[i, j] > dist_matrix[i-1, j-1]:
                ops.insert(0, ('replace', i - 1, j - 1))
            i -= 1
            j -= 1
        elif index == 1:
            ops.insert(0, ('insert', i - 1, j - 1))
            j -= 1
        elif index == 2:
            ops.insert(0, ('delete', i - 1, i - 1))
            i -= 1
    return ops

def execute_ops(ops, string1, string2):
    strings = [string1]
    string = list(string1)
    shift = 0
    for op in ops:
        i, j = op[1], op[2]
        if op[0] == 'delete':
            del string[i + shift]
            shift -= 1
        elif op[0] == 'insert':
            string.insert(i + shift + 1, string2[j])
            shift += 1
        elif op[0] == 'replace':
            string[i + shift] = string2[j]
        elif op[0] == 'transpose':
            string[i + shift], string[j + shift] = string[j + shift], string[i + shift]
        strings.append(''.join(string))
    return strings

def _levenshtein_distance_matrix(string1, string2, is_damerau=False):
    n1 = len(string1)
    n2 = len(string2)
    d = np.zeros((n1 + 1, n2 + 1), dtype=int)
    for i in range(n1 + 1):
        d[i, 0] = i
    for j in range(n2 + 1):
        d[0, j] = j
    for i in range(n1):
        for j in range(n2):
            if string1[i] == string2[j]:
                cost = 0
            else:
                cost = 1
            d[i+1, j+1] = min(d[i, j+1] + 1, # insert
                              d[i+1, j] + 1, # delete
                              d[i, j] + cost) # replace
            if is_damerau:
                if i > 0 and j > 0 and string1[i] == string2[j-1] and string1[i-1] == string2[j]:
                    d[i+1, j+1] = min(d[i+1, j+1], d[i-1, j-1] + cost) # transpose
    return d

if __name__ == "__main__":
    # GIFTS PROFIT
    # FBBDE BCDASD
    # SPARTAN PART
    # PLASMA ALTRUISM
    # REPUBLICAN DEMOCRAT
    # PLASMA PLASMA
    # FISH IFSH
    # STAES STATES
    string1 = 'FISH'
    string2 = 'IFSH'
    for is_damerau in [True, False]:
        if is_damerau:
            print('=== damerau_levenshtein_distance ===')
        else:
            print('=== levenshtein_distance ===')
        dist_matrix = _levenshtein_distance_matrix(string1, string2, is_damerau=is_damerau)
        print(dist_matrix)
        ops = get_ops(string1, string2, is_damerau=is_damerau)
        print(ops)
        res = execute_ops(ops, string1, string2)
        print(res)

输出:

=== damerau_levenshtein_distance ===
[[0 1 2 3 4]
 [1 1 1 2 3]
 [2 1 1 2 3]
 [3 2 2 1 2]
 [4 3 3 2 1]]
[('transpose', 1, 0)]
['FISH', 'IFSH']
=== levenshtein_distance ===
[[0 1 2 3 4]
 [1 1 1 2 3]
 [2 1 2 2 3]
 [3 2 2 2 3]
 [4 3 3 3 2]]
[('replace', 0, 0), ('replace', 1, 1)]
['FISH', 'IISH', 'IFSH']