我有一个for循环,它接受用户的输入和我的字典中的一个键并将它们传递给Damerau-Levenshtein函数并根据距离覆盖用户的输入字典键(for循环用于遍历每个字典键)。这对于大于三个字符的字符串足够好,但如果字符串是三个或更少字符,则算法返回错误的键。这是for循环:
1950 For j = 0 To dict.Count - 1
1960 distance = DamerauLevenshtein(SplitStr(i), dict.Keys(j))
1970 'MsgBox dict.Keys(j) & vbCrLf & distance ' used for debugging
1980 If distance < 4 Then
1990 If distance < leastDist Then
2000 leastDist = distance
2010 SplitStr(i) = dict.Keys(j)
2020 End If
2030 End If
2040 Next
2050 MsgBox "The distance is: " & leastDist & vbCrLf & "The entered text was " & tempStr & vbCrLf & "The replaced word is " & SplitStr(i)
SplitStr(i)保存用户的输入,该输入来自分割功能。我任意选择了4个好距离
我从a bytes.com forum post偷走了算法。算法如下:
Function DamerauLevenshtein(str1, str2, Optional intSize = 256)
Dim intTotalLen, arrDistance, intLen1, intLen2, i, j, arrStr1, arrStr2, arrDA, intMini
Dim intDB, intI1, intJ1, intD
str1 = UCase(str1)
str2 = UCase(str2)
intLen1 = Len(str1)
intLen2 = Len(str2)
intTotalLen = intLen1 + intLen2
ReDim arrStr1(intLen1)
ReDim arrStr2(intLen2)
ReDim arrDA(intSize)
ReDim arrDistance(intLen1 + 2, intLen2 + 2)
arrDistance(0, 0) = intTotalLen
For i = 0 To intSize - 1
arrDA(i) = 0
Next
For i = 0 To intLen1
arrDistance(i + 1, 1) = i
arrDistance(i + 1, 0) = intTotalLen
Next
For i = 1 To intLen1
arrStr1(i - 1) = Asc(Mid(str1, i, 1))
Next
For j = 0 To intLen2
arrDistance(1, j + 1) = j
arrDistance(0, j + 1) = intTotalLen
Next
For j = 1 To intLen2
arrStr2(j - 1) = Asc(Mid(str2, j, 1))
Next
For i = 1 To intLen1
intDB = 0
For j = 1 To intLen2
intI1 = arrDA(arrStr2(j - 1))
intJ1 = intDB
If arrStr1(i - 1) = arrStr2(j - 1) Then
intD = 0
Else
intD = 1
End If
If intD = 0 Then intDB = j
intMini = arrDistance(i, j) + intD
If intMini > arrDistance(i + 1, j) + 1 Then intMini = arrDistance(i + 1, j) + 1
If intMini > arrDistance(i, j + 1) + 1 Then intMini = arrDistance(i, j + 1) + 1
If intMini > arrDistance(intI1, intJ1) + i - intI1 + j - intJ1 - 1 Then intMini = arrDistance(intI1, intJ1) + i - intI1 + j - intJ1 - 1
arrDistance(i + 1, j + 1) = intMini
Next
arrDA(arrStr1(i - 1)) = i
Next
DamerauLevenshtein = arrDistance(intLen1 + 1, intLen2 + 1)
End Function
如果我输入&#34; Cire&#34;算法正确地返回&#34; CORE&#34;。
&#34;拉曼&#34;返回&#34; REMAN&#34; &#34; Cosnigned&#34;返回&#34;委托
然而,&#34;现在&#34; 应该返回&#34;新&#34;但返回&#34; OCM&#34;。
&#34;新&#34;也回归&#34; OCM&#34; (所以距离应为0,但为2。)
&#34; FP&#34;应该是&#34; FP&#34;但返回&#34; OCM&#34;,距离是2
&#34; DPF&#34;应该是&#34; DPF&#34;但返回&#34; OCM&#34;,距离是2
我刚刚了解了这个算法,所以我确定我错过了一些重要的东西,但我无法看到它。想法?
答案 0 :(得分:0)
我明白了。经过多次搜索,我发现一篇帖子说编辑距离通常为2.(他们没有说明为什么2是常见的任何优点)
我将if语句从4改为2,现在所有的问题条款都得到了纠正。