模糊查找-具有音节和子字符串匹配的Levenshtein

时间:2019-05-25 11:57:43

标签: excel vba levenshtein-distance fuzzy-search

我试图做的是,创建一个模糊查找算法,该算法可以帮助我们将这些唯一记录与映射表进行匹配,并向我们显示百分比。我正在应用Levenshtein算法来找到匹配项,这是众所周知的模糊查找算法。通过在两个字符串之间插入,删除和交换不匹配的字符,基本上可以找到两个源之间的距离。 但是,Levenshtein有其自身的结构问题。对于小长度的文本,Levenshtein的效果不是很好。例如,荷航和荷航皇家荷兰航空公司是同一回事,但由于首字母长度不足,Levenshtein将荷航与摩尔多瓦航空相匹配。 所以,我想通了,我需要一种逻辑来将Levenshtein与Syllable匹配和子字符串匹配结合在一起。 如果你们帮助我,我会很高兴。如下所示,您可以看到我的插件当前使用的代码。

' ------------------------------------‘
'Before here, i create arrays from ranges and all arrays are two dimensional arrays. Match and unmatched values are arr1 and arr2 at below code
    For m = LBound(arr1, 1) To UBound(arr1, 1)
        aresult = 0
        qnumber = 0
        For n = LBound(arr2, 1) To UBound(arr2, 1)
            qnumber = qnumber + 1
            a = Fuzzy(CStr(arr1(m, 1)), CStr(arr2(n, 1)))
            If a > aresult Then
                aresult = a
                qresult = qnumber
            End If
        Next n
        If aresult = 0 And qresult = 0 Then
            arr3(m, 1) = CVErr(xlErrNA)
            arr4(m, 1) = CVErr(xlErrNA)
        Else
            arr3(m, 1) = arr2(qresult, qnum)
            arr4(m, 1) = "%" & Round(aresult * 100, 0)
        End If
    Next m
Private Function Fuzzy(ByVal s1 As String, ByVal s2 As String) As Single
    Dim i As Integer, j As Integer, k As Integer, d1 As Integer, d2 As Integer, p As Integer
    Dim c As String, a1 As String, a2 As String, f As Single, o As Single, w As Single
    ' ******* INPUT STRINGS CLEANSING *******
    s1 = UCase(s1) 'input strings are converted to uppercase
    d1 = Len(s1)
    j = 1
    For i = 1 To d1
        c = Mid(s1, i, 1)
        Select Case c
            Case "0" To "9", "A" To "Z" 'filter the allowable characters
                a1 = a1 & c 'a1 is what remains from s1 after filtering
                 j = j + 1
        End Select
    Next
    If j = 1 Then Exit Function 'if s1 is empty after filtering
        d1 = j - 1
        s2 = UCase(s2)
        d2 = Len(s2)
        j = 1
        For i = 1 To d2
            c = Mid(s2, i, 1)
            Select Case c
                Case "0" To "9", "A" To "Z"
                    a2 = a2 & c
                    j = j + 1
            End Select
        Next
        If j = 1 Then Exit Function
        d2 = j - 1
        k = d1
        If d2 < d1 Then 'to prevent doubling the code below s1 must be made the shortest string, so we swap the variables
            k = d2
            d2 = d1
            d1 = k
            s1 = a2
            s2 = a1
            a1 = s1
            a2 = s2
        Else
            s1 = a1
            s2 = a2
        End If
        If k = 1 Then 'degenerate case, where the shortest string is just one character
            If InStr(1, s2, s1, vbBinaryCompare) > 0 Then
                Fuzzy = 1 / d2
            Else
                Fuzzy = 0
            End If
        Else '******* MAIN LOGIC HERE *******
            i = 1
            f = 0
            o = 0
            Do 'count the identical characters in s1 and s2 ("frequency analysis")
                p = InStr(1, s2, Mid(s1, i, 1), vbBinaryCompare) 'search the character at position i from s1 in s2
                If p > 0 Then 'found a matching character, at position p in s2
                    f = f + 1 'increment the frequency counter
                    s2 = Left(s2, p - 1) & "~" & Mid(s2, p + 1)
                    Do
                        If i >= k Then Exit Do 'no more characters to search
                            If Mid(s2, p + 1, 1) = Mid(s1, i + 1, 1) Then 'test if the next character is the same in the two strings
                                f = f + 1 'increment the frequency counter
                                o = o + 1 'increment the order counter
                                i = i + 1
                                p = p + 1
                            Else
                                Exit Do
                            End If
                    Loop
                End If
                If i >= k Then Exit Do
                    i = i + 1
            Loop
            If o > 0 Then o = o + 1
finish:
            w = 2
            Fuzzy = (w * o + f) / (w + 1) / d2
        End If

End Function

0 个答案:

没有答案