我试图做的是,创建一个模糊查找算法,该算法可以帮助我们将这些唯一记录与映射表进行匹配,并向我们显示百分比。我正在应用Levenshtein算法来找到匹配项,这是众所周知的模糊查找算法。通过在两个字符串之间插入,删除和交换不匹配的字符,基本上可以找到两个源之间的距离。 但是,Levenshtein有其自身的结构问题。对于小长度的文本,Levenshtein的效果不是很好。例如,荷航和荷航皇家荷兰航空公司是同一回事,但由于首字母长度不足,Levenshtein将荷航与摩尔多瓦航空相匹配。 所以,我想通了,我需要一种逻辑来将Levenshtein与Syllable匹配和子字符串匹配结合在一起。 如果你们帮助我,我会很高兴。如下所示,您可以看到我的插件当前使用的代码。
' ------------------------------------‘
'Before here, i create arrays from ranges and all arrays are two dimensional arrays. Match and unmatched values are arr1 and arr2 at below code
For m = LBound(arr1, 1) To UBound(arr1, 1)
aresult = 0
qnumber = 0
For n = LBound(arr2, 1) To UBound(arr2, 1)
qnumber = qnumber + 1
a = Fuzzy(CStr(arr1(m, 1)), CStr(arr2(n, 1)))
If a > aresult Then
aresult = a
qresult = qnumber
End If
Next n
If aresult = 0 And qresult = 0 Then
arr3(m, 1) = CVErr(xlErrNA)
arr4(m, 1) = CVErr(xlErrNA)
Else
arr3(m, 1) = arr2(qresult, qnum)
arr4(m, 1) = "%" & Round(aresult * 100, 0)
End If
Next m
Private Function Fuzzy(ByVal s1 As String, ByVal s2 As String) As Single
Dim i As Integer, j As Integer, k As Integer, d1 As Integer, d2 As Integer, p As Integer
Dim c As String, a1 As String, a2 As String, f As Single, o As Single, w As Single
' ******* INPUT STRINGS CLEANSING *******
s1 = UCase(s1) 'input strings are converted to uppercase
d1 = Len(s1)
j = 1
For i = 1 To d1
c = Mid(s1, i, 1)
Select Case c
Case "0" To "9", "A" To "Z" 'filter the allowable characters
a1 = a1 & c 'a1 is what remains from s1 after filtering
j = j + 1
End Select
Next
If j = 1 Then Exit Function 'if s1 is empty after filtering
d1 = j - 1
s2 = UCase(s2)
d2 = Len(s2)
j = 1
For i = 1 To d2
c = Mid(s2, i, 1)
Select Case c
Case "0" To "9", "A" To "Z"
a2 = a2 & c
j = j + 1
End Select
Next
If j = 1 Then Exit Function
d2 = j - 1
k = d1
If d2 < d1 Then 'to prevent doubling the code below s1 must be made the shortest string, so we swap the variables
k = d2
d2 = d1
d1 = k
s1 = a2
s2 = a1
a1 = s1
a2 = s2
Else
s1 = a1
s2 = a2
End If
If k = 1 Then 'degenerate case, where the shortest string is just one character
If InStr(1, s2, s1, vbBinaryCompare) > 0 Then
Fuzzy = 1 / d2
Else
Fuzzy = 0
End If
Else '******* MAIN LOGIC HERE *******
i = 1
f = 0
o = 0
Do 'count the identical characters in s1 and s2 ("frequency analysis")
p = InStr(1, s2, Mid(s1, i, 1), vbBinaryCompare) 'search the character at position i from s1 in s2
If p > 0 Then 'found a matching character, at position p in s2
f = f + 1 'increment the frequency counter
s2 = Left(s2, p - 1) & "~" & Mid(s2, p + 1)
Do
If i >= k Then Exit Do 'no more characters to search
If Mid(s2, p + 1, 1) = Mid(s1, i + 1, 1) Then 'test if the next character is the same in the two strings
f = f + 1 'increment the frequency counter
o = o + 1 'increment the order counter
i = i + 1
p = p + 1
Else
Exit Do
End If
Loop
End If
If i >= k Then Exit Do
i = i + 1
Loop
If o > 0 Then o = o + 1
finish:
w = 2
Fuzzy = (w * o + f) / (w + 1) / d2
End If
End Function