我尝试加载长度为64个字符的6.000.000(6 mio)字符串,以便在VBA中对它们进行排序。 我注意到的是: 1.当我使用数组时,占用的内存大约为916 MB 2.当我使用ArrayList时,占用的内存为1.105 MB 它们都不合理,因为字符串大小约为380 MB。 我做错了什么? 随着弦乐数量的快速增长,我将很快面对“失去记忆”。 任何想法都会受到欢迎。
Demetres
答案 0 :(得分:3)
大多数问题是VBA本身使用BSTRs,这是Unicode字符串。我假设您的计算~380 mb基于600万* 64个字符@每个1字节。实际上,数学算法是这样的:
VBA字符串是Unicode,在这种情况下表示每个字符为2 字节。
VBA中的字符串是4个字节,用于内部存储之前的长度 字符串,字符串末尾的unicode Null为2个字节,和 每个字符2个字节。
每64个字符为4 +(64 * 2)+ 2 = 134个字符 字符串。
String数组中的每个条目实际上都是指向String的指针,
所以每个插槽还有4个字节,到目前为止共有138个字节。
假设有600万个这样的字符串,即828,000,000字节(使用 逗号(美国风格),depending upon your definition of mb,为789.6或828 mb。
我不确定其余的开销,也许是垃圾收集器引用计数器?
无论如何,我建议您使用64个插槽字节数组来加载和存储字符串,假设它是ASCII字符。消除(4 + 64 + 2)* 6,000,000字节,您的代码可能会运行得更快,因为它不需要比较多少字节。您可以通过一次比较Word(32位或64位,具体取决于您的处理器)而不是逐个字符来优化排序。
<强>更新强>
我认为我在计算上有点不对劲。字节数是SAFEARRAYs,它们本身有很多开销,大约20个字节。因此节省的资金将接近(4 + 64 + 2 - 20)* 6,000,000。
在您查看此示例之前,请请接受我的建议并将您的文本导入Access以进行排序。 600万字符串总共380 MB,完全在Access' limits之内,Access可以(据我所知)对它们进行排序,而无需同时将所有字符串加载到内存中
使用以下文本创建名为“data.txt”的文本文件:
This
Is
A
File
Of
Strings
To
Sort
添加代码模块并将其命名为“mdlQuickSort”并添加以下代码。我没有评论太多,但如果你对它是如何工作感到好奇,你可以阅读Wikipedia's article on QuickSort或让我知道,我会添加更好的评论。
Option Explicit
Public Sub QuickSortInPlace(ByRef arrArray() As Variant)
If UBound(arrArray) <= 1 Then
Exit Sub
End If
qSort arrArray, 0, UBound(arrArray)
End Sub
Private Sub qSort(ByRef arrArray() As Variant, left As Long, right As Long)
Dim pivot As Long
Dim newPivotIndex As Long
If left < right Then
pivot = MedianOf3(arrArray, left, right)
newPivotIndex = partition(arrArray, left, right, pivot)
qSort arrArray, left, newPivotIndex - 1
qSort arrArray, newPivotIndex + 1, right
End If
End Sub
Private Function partition(ByRef arrArray() As Variant, left As Long, right As Long, pivot As Long) As Long
Dim pivotValue As Variant
pivotValue = arrArray(pivot)
Swap arrArray, pivot, right
Dim storeIndex As Long
storeIndex = left
Dim i As Long
For i = left To right - 1
If CompareFunc(arrArray(i), pivotValue) = -1 Then
Swap arrArray, i, storeIndex
storeIndex = storeIndex + 1
End If
Next
Swap arrArray, storeIndex, right
partition = storeIndex
End Function
Private Sub Swap(ByRef arrArray() As Variant, indexA As Long, indexB As Long)
Dim temp As Variant
temp = arrArray(indexA)
arrArray(indexA) = arrArray(indexB)
arrArray(indexB) = temp
End Sub
Private Function MedianOf3(ByRef arrArray() As Variant, left As Long, right As Long) As Long
Dim a As Variant, b As Variant, c As Variant
Dim indexA As Long, indexB As Long, indexC As Long
Dim ab As Long
Dim bc As Long
Dim ac As Long
indexA = left
indexB = (left + right) \ 2
indexC = right
a = arrArray(indexA)
b = arrArray(indexB)
c = arrArray(indexC)
ab = CompareFunc(a, b)
bc = CompareFunc(b, c)
ac = CompareFunc(a, c)
If ab = -1 Then
If ac = -1 Then
If bc = -1 Or bc = 0 Then
'a b c
'Already in B
Else
'a c b
Swap arrArray, indexB, indexC
End If
Else
'c a b
Swap arrArray, indexA, indexB
End If
Else
If bc = -1 Then
If ac = -1 Then
'b a c
Swap arrArray, indexA, indexB
Else
'b c a
Swap arrArray, indexB, indexC
End If
Else
'c b a
'Already in B
End If
End If
MedianOf3 = indexB
End Function
Private Function CompareFunc(str_a As Variant, str_b As Variant) As Long
Dim a As Byte
Dim b As Byte
Dim i As Long
For i = 0 To 63
a = str_a(i)
b = str_b(i)
If a <> b Then
Exit For
End If
Next
If i <= 63 Then
If a < b Then
CompareFunc = -1
Else
CompareFunc = 1
End If
Else
CompareFunc = 0
End If
End Function
最后,添加一个名为“mdlMain”的模块。这是字符串加载的地方。这是代码:
Option Explicit
Public Sub Main()
Dim arrStrings() As Variant
Dim i As Long
'Get the strings from the file
FillArrStringsInPlace arrStrings
'Print the unsorted list
Debug.Print "Unsorted Strings" & vbCrLf & "---------------------"
For i = 0 To UBound(arrStrings)
Debug.Print StrConv(arrStrings(i), vbUnicode)
Next
'Sort in place
QuickSortInPlace arrStrings
'Print the sorted list
Debug.Print vbCrLf & vbCrLf & "Sorted Strings" & vbCrLf & "---------------------"
For i = 0 To UBound(arrStrings)
Debug.Print StrConv(arrStrings(i), vbUnicode)
Next
End Sub
Public Sub FillArrStringsInPlace(ByRef arr() As Variant)
Dim iFile As Integer
Dim strInput As String
Dim lineCount As Long
Dim arrBytes() As Byte
'Open a file in the same folder as this Access db called "data.txt"
iFile = FreeFile
Open ActiveWorkbook.Path & "\data.txt" For Input As iFile
'Since I already know how many strings there are, I'm assigning it here.
'The alternatives would be to either "dynamically resize" the array, which
'is equivalent to copying the entire thing everytime you add a new string,
'Or to count the number of newlines in the file and dimensioning the array
'to that size before reading in the strings line by line. Neither is as
'efficient as just defining it before-hand.
ReDim arr(0 To 7)
While Not EOF(iFile)
Line Input #iFile, strInput
arrBytes = StrConv(strInput, vbFromUnicode)
ReDim Preserve arrBytes(0 To 63)
arr(lineCount) = arrBytes
lineCount = lineCount + 1
Wend
Close iFile
End Sub
我已经在那里放了一些代码来尝试使用CopyMemory进行优化,但这有点危险,所以我决定将其删除。