我的电子表格包含大约800,000行,包含30列。客户只对一列中的重复值感兴趣。他们需要整回来。例如,
MemberId|Name|Address|CircleScore
H111|John Doe|123 W Main|2.4
H222|Jane Doe|124 W Main|3.2
H333|Bob Doe|125 W Main|2.5
H444|Jake Doe|126 W Main|2.1
H555|Mike Doe|127 W Main|2.4
他们想要CircleScore中存在重复的整行。所以我过滤的excel应该只包含:
MemberId|Name|Address|CircleScore
H111|John Doe|123 W Main|2.4
H555|Mike Doe|127 W Main|2.4
我尝试突出显示重复的CircleScore和过滤,但过滤部分需要永远。我等了15分钟,但仍然没有运气。重复数据可能约为150k。
有替代方案吗?
答案 0 :(得分:3)
我会创建一个Is_Duplicated
指标列,并使用它来过滤重复的CircleScores
:
<小时/> 更新(每条评论):
或者,您可以sort
CircleScore
列,并使公式对您的系统减少负担(注意CircleScore
必须预先排序):
答案 1 :(得分:1)
请忽略此提交,如果您是a)按小时收到工资并感觉薪水过低,b)在例行程序中计划小睡,或c)a)和b)。 < / p>
如果任何数据集接近800K行(30列),您将要进入变体数组竞技场。处理工作表值的时间通常为处理工作表值的5-7%,因此非常适合大型数据块。
任何时候“重复”这个词发挥作用,我立即开始思考Scripting.Dictionary对象在Keys上的唯一索引如何受益。在此解决方案中,我使用一对字典来识别具有重复 Circle Score 值的数据行。
需要存储和传输大量数据单元。批量方法每次都会击败单个方法,剥离数据的最大方法是将所有800K行×30列填充到变量数组中。所有处理都在内存中,结果将返回到报告工作表 en masse 。
isolateDuplicateCircleScores代码
Sub isolateDuplicateCircleScores()
Dim d As Long, v As Long, csc As Long, stmp As String
Dim ky As Variant, itm As Variant, vVALs As Variant, dCSs As Object, dDUPs As Object
Dim w As Long, vWSs As Variant
'early binding
'dim dCSs As new scripting.dictionary, dDUPs As new scripting.dictionary
appTGGL bTGGL:=False
'late binding - not necessary with Early Binding (see footnote ¹)
Set dCSs = CreateObject("Scripting.Dictionary")
Set dDUPs = CreateObject("Scripting.Dictionary")
'set to the defaults (not necessary)
dCSs.comparemode = vbBinaryCompare
dDUPs.comparemode = vbBinaryCompare
'for testing on multiple row number scenarios
'vWSs = Array("CircleScores_8K", "CircleScores_80K", "CircleScores_800K")
'for runtime
vWSs = Array("CircleScores") '<~~ your source worksheet here
For w = LBound(vWSs) To UBound(vWSs)
'ThisWorkbook.Save
Debug.Print vWSs(w)
Debug.Print Timer
With Worksheets(vWSs(w))
On Error Resume Next
Worksheets(vWSs(w) & "_dupes").Delete
On Error GoTo 0
ReDim vVALs(0)
dCSs.RemoveAll
dDUPs.RemoveAll
'prep a new worksheet to receive the duplicates
.Cells(1, 1).CurrentRegion.Resize(2).Copy
With Worksheets.Add(after:=Worksheets(.Index))
.Name = vWSs(w) & "_dupes"
With .Cells(1, 1)
.PasteSpecial Paste:=xlPasteAllUsingSourceTheme, Operation:=xlNone
.PasteSpecial Paste:=xlPasteColumnWidths, Operation:=xlNone
.Value = .Value2
.Offset(1, 0).EntireRow.ClearContents
End With
End With
'finish prep with freeze row 1 and zoom to 80%
With Application.Windows(1)
.SplitColumn = 0
.SplitRow = 1
.FreezePanes = True
.Zoom = 80
End With
'grab all of the data into a variant array
ReDim vVALs(0)
csc = Application.Match("CircleScore", .Rows(1), 0) 'CircleScore column number needed later
vVALs = .Range(.Cells(2, 1), _
.Cells(.Cells(Rows.Count, csc).End(xlUp).Row, _
.Cells(1, Columns.Count).End(xlToLeft).Column)).Value2
'Debug.Print LBound(vVALs, 1) & ":" & UBound(vVALs, 1) '1:~800K
'Debug.Print LBound(vVALs, 2) & ":" & UBound(vVALs, 2) '1:~30
End With 'done with the original worksheet
'populate the dDUPs dictionary using the key index in dCSs
For v = LBound(vVALs, 1) To UBound(vVALs, 1)
If dCSs.exists(vVALs(v, csc)) Then
stmp = vVALs(v, 1)
For d = LBound(vVALs, 2) + 1 To UBound(vVALs, 2)
stmp = Join(Array(stmp, vVALs(v, d)), ChrW(8203))
Next d
dDUPs.Add Key:=v, Item:=stmp
If Not dDUPs.exists(dCSs.Item(vVALs(v, csc))) Then
stmp = vVALs(dCSs.Item(vVALs(v, csc)), 1)
For d = LBound(vVALs, 2) + 1 To UBound(vVALs, 2)
stmp = Join(Array(stmp, vVALs(dCSs.Item(vVALs(v, csc)), d)), ChrW(8203))
Next d
dDUPs.Add Key:=dCSs.Item(vVALs(v, csc)), Item:=stmp
End If
Else
dCSs.Item(vVALs(v, csc)) = v
End If
Next v
'split the dDUPs dictionary items back into a variant array
d = 1
ReDim vVALs(1 To dDUPs.Count, 1 To UBound(vVALs, 2))
For Each ky In dDUPs.keys
itm = Split(dDUPs.Item(ky), ChrW(8203))
For v = LBound(itm) To UBound(itm)
vVALs(d, v + 1) = itm(v)
Next v
d = d + 1
Next ky
'put the values into the duplicates worksheet
With Worksheets(vWSs(w) & "_dupes")
.Cells(2, 1).Resize(UBound(vVALs, 1), UBound(vVALs, 2)) = vVALs
With .Cells(1, 1).CurrentRegion
With .Resize(.Rows.Count - 1, .Columns.Count).Offset(1, 0)
.Rows(1).Copy
.PasteSpecial Paste:=xlPasteFormats, Operation:=xlNone
End With
.Cells.Sort Key1:=.Columns(csc), Order1:=xlAscending, _
Key2:=.Columns(1), Order2:=xlAscending, _
Orientation:=xlTopToBottom, Header:=xlYes
End With
End With
Debug.Print Timer
Next w
dCSs.RemoveAll: Set dCSs = Nothing
dDUPs.RemoveAll: Set dDUPs = Nothing
appTGGL
End Sub
Public Sub appTGGL(Optional bTGGL As Boolean = True)
With Application
.ScreenUpdating = bTGGL
.EnableEvents = bTGGL
.DisplayAlerts = bTGGL
.AutoRecover.Enabled = bTGGL 'no interruptions with an auto-save
.Calculation = IIf(bTGGL, xlCalculationAutomatic, xlCalculationManual)
.CutCopyMode = False
.StatusBar = vbNullString
End With
Debug.Print Timer
End Sub
<强> Sample Data and Results 强>
~123K行×30列重复行(按大约一分半的时间排序和格式化)
<强> Timed Results 强>
tbh,我从未在旧笔记本电脑上获得32位版本的Excel,无需重新启动Excel即可多次运行800K传递。重新启动后,结果与显示的内容一致。 64位Excel在没有打嗝的情况下反复运行。
大型工作表附录
在处理包含大数据块的工作表时,有一些常规改进可能会限制您的等待时间。您使用Excel作为中型数据库工具,因此将数据工作表视为原始数据。
TODAY()
填充了行的范围,您可以更经常地坐在拇指上。 ¹如果您可以将Scripting.Dictionary的后期绑定转换为早期绑定,则必须将 Microsoft Scripting Runtime 添加到VBE的工具►参考。 < / p>
²当整个工作簿中的任何内容发生更改时,挥发性函数会重新计算,而不仅仅是在影响其结果的内容发生更改时。 易失性函数的示例包括INDIRECT,OFFSET,TODAY,NOW,RAND和RANDBETWEEN。 CELL和INFO工作表函数的某些子函数也会使它们变得不稳定。
答案 2 :(得分:0)
试试这个Vba代码(并学习一点荷兰语)
Sub DuplicatesInColumn()
'maakt een lijst met de aangetroffen dubbelingen
Dim LaatsteRij As Long
Dim MatchNr As Long
Dim iRij, iKolom, iTeller, Teller As Long, ControlKolom As Long
iRij = 1
iKolom = 5 'number of columns in the sheet, Chance if not correct
ControlKolom = 4 'column number where to find the doubles, Chance if not correct
LaatsteRij = Cells(65000, iKolom).End(xlUp).Row: iTeller = iKolom
Sheet1.Activate
For iRij = 1 To LaatsteRij
If Cells(iRij, ControlKolom) <> "" Then
MatchNr = WorksheetFunction.Match(Cells(iRij, ControlKolom), Range(Cells(1, ControlKolom), Cells(LaatsteRij, ControlKolom)), 0)
If iRij <> MatchNr Then
iTeller = iKolom
For Teller = 1 To iTeller
Cells(iRij, iKolom + Teller).Offset(0, 2).Value = Range(Cells(iRij, Teller), Cells(iRij, Teller)).Value
Next Teller
End If: End If
Next
End Sub