我有供应商列表,我想查看它们是否有任何可能的重复。
我们来看一些供应商名称的例子:
1. The Supplier GmbH
2. Trading Company LLC & Co. KG
3. DHL Express
4. DHL-Express Inc.
5. Supplier GmbH
6. Trading S.a.r.l.
我创建了两个正则表达式函数:
StripNonAlpha
删除所有非字母字符和两个字母单词并用空格替换“ - ”,WordMatch
取两个参数,如果公司名称中存在此特定单词,则返回True
我想检查整个单词,而不是部分单词,这就是为什么我没有使用InStr
)。
从上面获取供应商名称,我希望例如供应商1和5,3和4标记为可能的重复,但不是2和6。
我有大约100K供应商要检查,但代码运行速度很慢。任何线索如何优化它?
Function StripNonAlpha(TextToReplace As String) As String
Dim ObjRegex As Object
Set ObjRegex = CreateObject("vbscript.regexp")
With ObjRegex
.Global = True
.Pattern = "[^a-zA-Z\s]+"
StripNonAlpha = .Replace(Replace(TextToReplace, "-", Chr(32)), vbNullString)
.Pattern = "\b.{2}\b"
StripNonAlpha = .Replace(StripNonAlpha, vbNullString)
End With
End Function
Function WordMatch(Source As String, MatchExprValue As String) As Boolean
Dim RE As Object
Set RE = CreateObject("vbscript.regexp")
RE.IgnoreCase = True
RE.Pattern = "\b" & MatchExprValue & "\b"
WordMatch = RE.test(Source)
End Function
Sub possible_duplicatev2()
Dim arr1() As String
Dim exclude(1 To 6) As String
Dim arr2() As String
Dim companyname As Range
Dim i As Long
Dim j As Long
Dim k As Long
Dim l As Long
Dim FoundCount As Long
Dim DuplicateCount As Long
Dim rc As Long
Dim scompanyname As String
Dim x As Long
Dim y As Long
exclude(1) = "sarl"
exclude(2) = "gmbh"
exclude(3) = "llc"
exclude(4) = "inc"
exclude(5) = "the"
exclude(6) = "sas"
rc = Range("A" & Rows.Count).End(xlUp).Row
For x = rc To 2 Step -1
If LCase(Range("B" & x).Text) Like "*zzz*" Or LCase(Range("B" & x).Text) Like "*xxx*" Or LCase(Range("B" & x).Text) Like "*yyy*" Then
Range("B" & x).EntireRow.Delete
End If
Next x
rc = Range("A" & Rows.Count).End(xlUp).Row - 1
ReDim arr1(1 To rc, 1 To 2)
For Each companyname In Range("B2", Range("B1").End(xlDown))
scompanyname = StripNonAlpha(LCase(companyname))
arr1(companyname.Row - 1, 1) = scompanyname
Next companyname
For i = 1 To UBound(arr1, 1)
For j = 1 To UBound(exclude)
If WordMatch(arr1(i, 1), exclude(j)) = True Then
Replace arr1(i, 1), exclude(j), ""
End If
Next j
arr2 = Split(arr1(i, 1), " ")
For k = 1 To UBound(arr1, 1)
For l = 0 To UBound(arr2)
If k = i Then
GoTo nextk
ElseIf WordMatch(arr1(k, 1), arr2(l)) = True Then
FoundCount = FoundCount + 1
End If
Next l
If UBound(arr2) = 1 And FoundCount = 1 Then
arr1(k, 2) = "Yes"
DuplicateCount = DuplicateCount + 1
ElseIf UBound(arr2) > 0 And FoundCount > 1 Then
arr1(k, 2) = "Yes"
DuplicateCount = DuplicateCount + 1
Else
arr1(k, 2) = "No"
End If
FoundCount = 0
nextk:
Next k
If DuplicateCount > 0 Then
arr1(i, 2) = "Yes"
Else
arr1(i, 2) = "No"
End If
DuplicateCount = 0
Next i
For y = 1 To UBound(arr1, 1)
Range("D" & y + 1) = arr1(y, 2)
Next y
End Sub