基于具有模糊匹配的两列合并两个数据库

时间:2015-03-04 15:34:20

标签: database sqlite excel-vba excel-formula openrefine

我有两个大型数据库,每个数据库都有一些Excel格式的列。我需要合并两个数据库,以便我可以合并两个数据库中匹配的行。

问题是我需要使用的列在两个数据库中可能不是100%相同。列将是“地址”列和名称列。地址可能与两个数据库的格式不同。 此外,国家,城市和其他列在两个数据库中都是相同的,可以用作确认标准。

我想到了Google Refine。任何帮助将不胜感激。

1 个答案:

答案 0 :(得分:0)

我处于类似情况,我发现Microsoft developed a Fuzzy Lookup Add-In可能有用。我不知道是否有更新的版本 我也发现了这段代码,但我不记得在哪里,所以我无法引用源代码(我在一些测试期间评论了部分内容,因此请注意):

Option Explicit
Type RankInfo
    Offset As Integer
    Percentage As Single
End Type
Function FuzzyPercent(ByVal String1 As String, _
                      ByVal String2 As String, _
                      Optional Algoritm As Integer = 3) As Single
'*************************************
'** Return a % match on two strings **
'*************************************
Dim intLen1 As Integer
Dim intCurLen As Integer
Dim intTo As Integer
Dim intPos As Integer
Dim intPtr As Integer
Dim intScore As Integer
Dim intTotScore As Integer
Dim intStartPos As Integer
Dim StrWork As String
Dim Str1 As String
Dim Str2 As String
'---------------------------------------------------
'-- Remove surrounding spaces & ensure lower case --
'---------------------------------------------------
Str1 = LCase$(Trim$(String1))
Str2 = LCase$(Trim$(String2))
'----------------------------------------------
'-- Give 100% match if strings exactly equal --
'----------------------------------------------
If Str1 = Str2 Then
    FuzzyPercent = 1
    Exit Function
End If
intLen1 = Len(Str1)
'----------------------------------------
'-- Give 0% match if string length < 2 --
'----------------------------------------
If intLen1 < 2 Then
    FuzzyPercent = 0
    Exit Function
End If
intTotScore = 0                   'initialise total possible score
intScore = 0                      'initialise current score
'--------------------------------------------------------
'-- If Algoritm = 1 or 3, Search for single characters --
'--------------------------------------------------------
If (Algoritm And 1) <> 0 Then
    intTotScore = intLen1                   'initialise total possible score
    intPos = 0
    For intPtr = 1 To intLen1
        intStartPos = intPos + 1
        intPos = InStr(intStartPos, Str2, Mid$(Str1, intPtr, 1))
        If intPos > 0 Then
            If intPos > intStartPos + 3 Then     'No match if char is > 3 bytes away
                intPos = intStartPos
            Else
                intScore = intScore + 1          'Update current score
            End If
        Else
            intPos = intStartPos
        End If
    Next intPtr
End If
'-----------------------------------------------------------
'-- If Algoritm = 2 or 3, Search for pairs, triplets etc. --
'-----------------------------------------------------------
If (Algoritm And 2) <> 0 Then
    For intCurLen = 2 To intLen1
        StrWork = Str2                          'Get a copy of String2
        intTo = intLen1 - intCurLen + 1
        intTotScore = intTotScore + Int(intLen1 / intCurLen)  'Update total possible score
        For intPtr = 1 To intTo Step intCurLen
            intPos = InStr(StrWork, Mid$(Str1, intPtr, intCurLen))
            If intPos > 0 Then
                Mid$(StrWork, intPos, intCurLen) = String$(intCurLen, &H0) 'corrupt found string
                intScore = intScore + 1     'Update current score
            End If
        Next intPtr
    Next intCurLen
End If
FuzzyPercent = intScore / intTotScore
End Function
Function FuzzyVLookup(ByVal LookupValue As String, _
                      ByVal TableArray As Range, _
                      ByVal IndexNum As Integer, _
                      Optional NFPercent As Single = 0.05, _
                      Optional Rank As Integer = 1, _
                      Optional Algoritm As Integer = 3) As Variant
'********************************************************************************
'** Function to Fuzzy match LookupValue with entries in                        **
'** column 1 of table specified by TableArray.                                 **
'** TableArray must specify the top left cell of the range to be searched      **
'** The function stops scanning the table when an empty cell in column 1       **
'** is found.                                                                  **
'** For each entry in column 1 of the table, FuzzyPercent is called to match   **
'** LookupValue with the Table entry.                                          **
'** 'Rank' is an optional parameter which may take any value > 0               **
'**        (default 1) and causes the function to return the 'nth' best        **
'**         match (where 'n' is defined by 'Rank' parameter)                   **
'** If the 'Rank' match percentage < NFPercent (Default 5%), #N/A is returned. **
'** IndexNum is the column number of the entry in TableArray required to be    **
'** returned, as follows:                                                      **
'** If IndexNum > 0 and the 'Rank' percentage match is >= NFPercent            **
'**                 (Default 5%) the column entry indicated by IndexNum is     **
'**                 returned.                                                  **
'** if IndexNum = 0 and the 'Rank' percentage match is >= NFPercent            **
'**                 (Default 5%) the offset row (starting at 1) is returned.   **
'**                 This value can be used directly in the 'Index' function.   **
'**                                                                            **
'** Algoritm can take one of the following values:                             **
'** Algoritm = 1:                                                              **
'**     For each character in 'String1', a search is performed on 'String2'.   **
'**     The search is deemed successful if a character is found in 'String2'   **
'**     within 3 characters of the current position.                           **
'**     A score is kept of matching characters which is returned as a          **
'**     percentage of the total possible score.                                **
'** Algoritm = 2:                                                              **
'**     A count of matching pairs, triplets, quadruplets etc. in 'String1' and **
'**     'String2' is returned as a percentage of the total possible.           **
'** Algoritm = 3: Both Algoritms 1 and 2 are performed.                        **
'********************************************************************************
Dim strLookupValue As String
Dim strListString As String
Dim StrWork As String
Dim sngMinPercent As Single
Dim sngWork As Single
Dim sngCurPercent  As Single
Dim intBestMatchPtr As Integer
Dim intPtr As Integer
Dim intRankPtr As Integer
Dim intRankPtr1 As Integer
Dim udRankData() As RankInfo
'--------------------------------------------------------------
'--    Validation                                            --
'--------------------------------------------------------------
strLookupValue = Trim$(LCase$(LookupValue))
If IsMissing(NFPercent) Then
    sngMinPercent = 0.05
Else
    If (NFPercent <= 0) Or (NFPercent > 1) Then
        FuzzyVLookup = "*** 'NFPercent' must be a percentage > zero ***"
        Exit Function
    End If
    sngMinPercent = NFPercent
End If
If Rank < 1 Then
    FuzzyVLookup = "*** 'Rank' must be an integer > 0 ***"
    Exit Function
End If
ReDim udRankData(1 To Rank)
'---------------
'-- Main loop --
'---------------
intPtr = 1
Do While VarType(TableArray.Cells(intPtr, 1)) <> vbEmpty
    If VarType(TableArray.Cells(intPtr, 1)) = vbString Then
        strListString = Trim$(LCase$(TableArray.Cells(intPtr, 1)))

        '------------------------------------------------
        '-- Fuzzy match strings & get percentage match --
        '------------------------------------------------
        sngCurPercent = FuzzyPercent(String1:=strLookupValue, _
                                     String2:=strListString, _
                                     Algoritm:=Algoritm)

        If sngCurPercent >= sngMinPercent Then
            '---------------------------
            '-- Store in ranked array --
            '---------------------------
            For intRankPtr = 1 To Rank
                If sngCurPercent > udRankData(intRankPtr).Percentage Then
                    For intRankPtr1 = Rank To intRankPtr + 1 Step -1
                        With udRankData(intRankPtr1)
                            .Offset = udRankData(intRankPtr1 - 1).Offset
                            .Percentage = udRankData(intRankPtr1 - 1).Percentage
                        End With
                    Next intRankPtr1
                    With udRankData(intRankPtr)
                        .Offset = intPtr
                        .Percentage = sngCurPercent
                    End With
                    Exit For
                End If
            Next intRankPtr
        End If

    End If
    intPtr = intPtr + 1
Loop
If udRankData(Rank).Percentage < sngMinPercent Then
    '--------------------------------------
    '-- Return '#N/A' if below NFPercent --
    '--------------------------------------
    FuzzyVLookup = CVErr(xlErrNA)
Else
    intBestMatchPtr = udRankData(Rank).Offset
    If IndexNum > 0 Then
        '-----------------------------------
        '-- Return column entry specified --
        '-----------------------------------
        FuzzyVLookup = TableArray.Cells(intBestMatchPtr, IndexNum)
    Else
        '-----------------------
        '-- Return offset row --
        '-----------------------
        FuzzyVLookup = intBestMatchPtr
    End If
End If
End Function
Function FuzzyHLookup(ByVal LookupValue As String, _
                      ByVal TableArray As Range, _
                      ByVal IndexNum As Integer, _
                      Optional NFPercent As Single = 0.05, _
                      Optional Rank As Integer = 1, _
                      Optional Algoritm As Integer = 3) As Variant
'********************************************************************************
'** Function to Fuzzy match LookupValue with entries in                        **
'** row 1 of table specified by TableArray.                                    **
'** TableArray must specify the top left cell of the range to be searched      **
'** The function stops scanning the table when an empty cell in row 1          **
'** is found.                                                                  **
'** For each entry in row 1 of the table, FuzzyPercent is called to match      **
'** LookupValue with the Table entry.                                          **
'** 'Rank' is an optional parameter which may take any value > 0               **
'**        (default 1) and causes the function to return the 'nth' best        **
'**         match (where 'n' is defined by 'Rank' parameter)                   **
'** If the 'Rank' match percentage < NFPercent (Default 5%), #N/A is returned. **
'** IndexNum is the row number of the entry in TableArray required to be       **
'** returned, as follows:                                                      **
'** If IndexNum > 0 and the 'Rank' percentage match is >= NFPercent            **
'**                 (Default 5%) the row entry indicated by IndexNum is        **
'**                 returned.                                                  **
'** if IndexNum = 0 and the 'Rank' percentage match is >= NFPercent            **
'**                 (Default 5%) the offset col (starting at 0) is returned.   **
'**                 This value can be used directly in the 'OffSet' function.  **
'**                                                                            **
'** Algoritm can take one of the following values:                             **
'** Algoritm = 1:                                                              **
'**     For each character in 'String1', a search is performed on 'String2'.   **
'**     The search is deemed successful if a character is found in 'String2'   **
'**     within 3 characters of the current position.                           **
'**     A score is kept of matching characters which is returned as a          **
'**     percentage of the total possible score.                                **
'** Algoritm = 2:                                                              **
'**     A count of matching pairs, triplets, quadruplets etc. in 'String1' and **
'**     'String2' is returned as a percentage of the total possible.           **
'** Algoritm = 3: Both Algoritms 1 and 2 are performed.                        **
'********************************************************************************
Dim strLookupValue As String
Dim strListString As String
Dim StrWork As String
Dim sngMinPercent As Single
Dim sngWork As Single
Dim sngCurPercentc  As Single
Dim intBestMatchPtr As Integer
Dim intPtr As Integer
Dim intRankPtr As Integer
Dim intRankPtr1 As Integer
Dim udRankData() As RankInfo
'--------------------------------------------------------------
'--    Validation                                            --
'--------------------------------------------------------------
strLookupValue = Trim$(LCase$(LookupValue))
If IsMissing(NFPercent) Then
    sngMinPercent = 0.05
Else
    If (NFPercent <= 0) Or (NFPercent > 1) Then
        FuzzyHLookup = "*** 'NFPercent' must be a percentage > zero ***"
        Exit Function
    End If
    sngMinPercent = NFPercent
End If
If Rank < 1 Then
    FuzzyHLookup = "*** 'Rank' must be an integer > 0 ***"
    Exit Function
End If
ReDim udRankData(1 To Rank)
'---------------
'-- Main loop --
'---------------
intPtr = 1
Do While VarType(TableArray.Cells(1, intPtr)) <> vbEmpty
    If VarType(TableArray.Cells(1, intPtr)) = vbString Then
        strListString = Trim$(LCase$(TableArray.Cells(1, intPtr)))

'        If Rank = 1 Then
'            If strLookupValue = strListString Then
'                '-- 100% match ! --
'                If IndexNum > 0 Then
'                    FuzzyHLookup = TableArray.Cells(IndexNum, intPtr)
'                Else
'                    FuzzyHLookup = intPtr
'                End If
'                Exit Function
'            End If
'        End If

        '------------------------------------------------
        '-- Fuzzy match strings & get percentage match --
        '------------------------------------------------
        sngCurPercent = FuzzyPercent(String1:=strLookupValue, _
                                     String2:=strListString, _
                                     Algoritm:=Algoritm)

        If sngCurPercent >= sngMinPercent Then
            '---------------------------
            '-- Store in ranked array --
            '---------------------------
            For intRankPtr = 1 To Rank
                If sngCurPercent > udRankData(intRankPtr).Percentage Then
                    For intRankPtr1 = Rank To intRankPtr + 1 Step -1
                        With udRankData(intRankPtr1)
                            .Offset = udRankData(intRankPtr1 - 1).Offset
                            .Percentage = udRankData(intRankPtr1 - 1).Percentage
                        End With
                    Next intRankPtr1
                    With udRankData(intRankPtr)
                        .Offset = intPtr
                        .Percentage = sngCurPercent
                    End With
                    Exit For
                End If
            Next intRankPtr
        End If

    End If
    intPtr = intPtr + 1
Loop
If udRankData(Rank).Percentage < sngMinPercent Then
    '--------------------------------------
    '-- Return '#N/A' if below NFPercent --
    '--------------------------------------
    FuzzyHLookup = CVErr(xlErrNA)
Else
    intBestMatchPtr = udRankData(Rank).Offset
    If IndexNum > 0 Then
        '-----------------------------------
        '-- Return row entry specified --
        '-----------------------------------
        FuzzyHLookup = TableArray.Cells(IndexNum, intBestMatchPtr)
    Else
        '-----------------------
        '-- Return offset col --
        '-----------------------
        FuzzyHLookup = intBestMatchPtr
    End If
End If
End Function