使用VBA,我需要将当前分隔的文本文件(数百列数万行)中的数据“展开”为标准化形式(四列数百万行);也就是说,结果表将包含每个单元格的列:
对于如何有效地完成这项任务,我会感激不尽。
到目前为止,我已经考虑过使用ADODB构建一个构建输出表的SELECT INTO ... UNION ...
查询,但是默认的文本文件提供程序很可能只限于255列(有没有哪些列?)。< / p>
SébastienLorion建造了一个非常棒的Fast CSV Reader,我很乐意使用它,但我不知道如何在VBA中使用它 - 感谢任何想法(我认为它没有被编译到导出COM接口,我没有重新编译它的工具)。就此而言,微软还提供TextFieldParser类,但我不知道是否/如何在VBA中使用它。
另一种方法可能是让Excel&gt; = 2007打开源文件并从那里构建输出表,但直觉上“感觉”好像会产生相当大的浪费开销......
答案 0 :(得分:2)
已编译但未经过测试
Sub UnpivotFile(sPath As String)
Const DELIM As String = ","
Const QUOTE As String = """"
Dim FSO As New FileSystemObject
Dim arrHeader
Dim arrContent
Dim lb As Integer, ub As Integer
Dim x As Integer
Dim inData As Boolean
Dim l As String, fName As String
Dim fIn As Scripting.TextStream
Dim fOut As Scripting.TextStream
Dim tmp As String
Dim lineNum As Long
fName = FSO.GetFileName(sPath)
Set fIn = FSO.OpenTextFile(sPath, ForReading)
Set fOut = FSO.OpenTextFile(sPath & "_out", ForWriting)
lineNum = 0
Do While Not fIn.AtEndOfStream
l = fIn.ReadLine
lineNum = lineNum + 1
arrContent = ParseLineToArray(l, DELIM, QUOTE)
If Not inData Then
arrHeader = arrContent
lb = LBound(arrHeader)
ub = UBound(arrHeader)
inData = True
Else
For x = lb To ub
fOut.WriteLine Join(Array(fName, lineNum, _
QID(arrHeader(x), DELIM, QUOTE), _
QID(arrContent(x), DELIM, QUOTE)), DELIM)
Next x
End If
Loop
fIn.Close
fOut.Close
End Sub
'quote if delimiter found
Function QID(s, d As String, q As String)
QID = IIf(InStr(s, d) > -1, q & s & q, s)
End Function
'Split a string into an array based on a Delimiter and a Text Identifier
Private Function ParseLineToArray(sInput As String, m_Delim As String, _
m_TextIdentifier As String) As Variant
'Dim vArr As Variant
Dim sArr() As String
Dim bInText As Boolean
Dim i As Long, n As Long
Dim sTemp As String, tmp As String
If sInput = "" Or InStr(1, sInput, m_Delim) = 0 Then
'zero length string, or delimiter not present
'dump all input into single-element array (minus Text Identifier)
ReDim sArr(0)
sArr(0) = Replace(sInput, m_TextIdentifier, "")
ParseLineToArray = sArr()
Else
If InStr(1, sInput, m_TextIdentifier) = 0 Then
'no text identifier so just split and return
sArr() = Split(sInput, m_Delim)
ParseLineToArray = sArr()
Else
'found the text identifier, so do it the long way
bInText = False
sTemp = ""
n = 0
For i = 1 To Len(sInput)
tmp = Mid(sInput, i, 1)
If tmp = m_TextIdentifier Then
'just toggle the flag - don't add to string
bInText = Not bInText
Else
If tmp = m_Delim Then
If Not bInText Then
'delimiter not within quoted text, so add next array member
ReDim Preserve sArr(n)
sArr(n) = sTemp
sTemp = ""
n = n + 1
Else
sTemp = sTemp & tmp
End If
Else
sTemp = sTemp & tmp
End If 'character is a delimiter
End If 'character is a quote marker
Next i
ReDim Preserve sArr(n)
sArr(n) = sTemp
ParseLineToArray = sArr()
End If 'has any quoted text
End If 'parseable
End Function
答案 1 :(得分:1)
这应该足够快(在我的机器上18MB文件需要8秒,但我只复制数据,我不重组它 - 如果你不做计算但只重新排序你应该得到相同的东西一种表现)。即使行/列的数量不适合电子表格,它也能正常工作。
TODO :它有点长,但您应该能够(a)复制粘贴它(b)更改文件名和(c)修改manipulateData函数以满足您的需要。其余的代码是一堆可重用的实用程序函数,您不需要更改它们。
我不确定使用VBA可以获得更快的速度 - 如果你需要更快,你应该考虑另一种语言。通常,Java或C#中的相同代码会短得多,因为它们已经有标准库来读/写文件等,而且速度也会更快。
Option Explicit
Public Sub doIt()
Dim sourceFile As String
Dim destinationFile As String
Dim data As Variant
Dim result As Variant
sourceFile = "xxxxxxx"
destinationFile = "xxxxxxx"
data = getDataFromFile(sourceFile, ",")
If Not isArrayEmpty(data) Then
result = manipulateData(data)
writeToCsv result, destinationFile, ","
Else
MsgBox ("Empty file")
End If
End Sub
Function manipulateData(sourceData As Variant) As Variant
Dim result As Variant
Dim i As Long
Dim j As Long
Dim k As Long
Dim m As Long
'redim the result array to the right size - here I only copy so same size as source
ReDim result(1 To UBound(sourceData, 1), 1 To UBound(sourceData, 2)) As Variant
For i = LBound(sourceData, 1) To UBound(sourceData, 1)
For j = LBound(sourceData, 2) To UBound(sourceData, 2)
k = i 'k to be defined - here I only copy data
m = j 'm to be defined - here I only copy data
result(k, m) = sourceData(i, j)
Next j
Next i
manipulateData = result
End Function
Public Sub writeToCsv(parData As Variant, parFileName As String, parDelimiter As String)
If getArrayNumberOfDimensions(parData) <> 2 Then Exit Sub
Dim i As Long
Dim j As Long
Dim fileNum As Long
Dim locLine As String
Dim locCsvString As String
fileNum = FreeFile
If Dir(parFileName) <> "" Then Kill (parFileName)
Open parFileName For Binary Lock Read Write As #fileNum
For i = LBound(parData, 1) To UBound(parData, 1)
locLine = ""
For j = LBound(parData, 2) To UBound(parData, 2)
If IsError(parData(i, j)) Then
locLine = locLine & "#N/A" & parDelimiter
Else
locLine = locLine & parData(i, j) & parDelimiter
End If
Next j
locLine = Left(locLine, Len(locLine) - 1)
If i <> UBound(parData, 1) Then locLine = locLine & vbCrLf
Put #fileNum, , locLine
Next i
error_handler:
Close #fileNum
End Sub
Public Function isArrayEmpty(parArray As Variant) As Boolean
'Returns false if not an array or dynamic array that has not been initialised (ReDim) or has been erased (Erase)
If IsArray(parArray) = False Then isArrayEmpty = True
On Error Resume Next
If UBound(parArray) < LBound(parArray) Then isArrayEmpty = True: Exit Function Else: isArrayEmpty = False
End Function
Public Function getArrayNumberOfDimensions(parArray As Variant) As Long
'Returns the number of dimension of an array - 0 for an empty array.
Dim i As Long
Dim errorCheck As Long
If isArrayEmpty(parArray) Then Exit Function 'returns 0
On Error GoTo FinalDimension
'Visual Basic for Applications arrays can have up to 60000 dimensions
For i = 1 To 60001
errorCheck = LBound(parArray, i)
Next i
'Not supposed to happen
getArrayNumberOfDimensions = 0
Exit Function
FinalDimension:
getArrayNumberOfDimensions = i - 1
End Function
Private Function getDataFromFile(parFileName As String, parDelimiter As String, Optional parExcludeCharacter As String = "") As Variant
'parFileName is supposed to be a delimited file (csv...)
'parDelimiter is the delimiter, "," for example in a comma delimited file
'Returns an empty array if file is empty or can't be opened
'number of columns based on the line with the largest number of columns, not on the first line
'parExcludeCharacter: sometimes csv files have quotes around strings: "XXX" - if parExcludeCharacter = """" then removes the quotes
Dim locLinesList() As Variant
Dim locData As Variant
Dim i As Long
Dim j As Long
Dim locNumRows As Long
Dim locNumCols As Long
Dim fso As Variant
Dim ts As Variant
Const REDIM_STEP = 10000
Set fso = CreateObject("Scripting.FileSystemObject")
On Error GoTo error_open_file
Set ts = fso.OpenTextFile(parFileName)
On Error GoTo unhandled_error
'Counts the number of lines and the largest number of columns
ReDim locLinesList(1 To 1) As Variant
i = 0
Do While Not ts.AtEndOfStream
If i Mod REDIM_STEP = 0 Then
ReDim Preserve locLinesList(1 To UBound(locLinesList, 1) + REDIM_STEP) As Variant
End If
locLinesList(i + 1) = Split(ts.ReadLine, parDelimiter)
j = UBound(locLinesList(i + 1), 1) 'number of columns
If locNumCols < j Then locNumCols = j
If j = 13 Then
j = j
End If
i = i + 1
Loop
ts.Close
locNumRows = i
If locNumRows = 0 Then Exit Function 'Empty file
ReDim locData(1 To locNumRows, 1 To locNumCols + 1) As Variant
'Copies the file into an array
If parExcludeCharacter <> "" Then
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
If Left(locLinesList(i)(j), 1) = parExcludeCharacter Then
If Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Mid(locLinesList(i)(j), 2, Len(locLinesList(i)(j)) - 2) 'If locTempArray = "", Mid returns ""
Else
locLinesList(i)(j) = Right(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
ElseIf Right(locLinesList(i)(j), 1) = parExcludeCharacter Then
locLinesList(i)(j) = Left(locLinesList(i)(j), Len(locLinesList(i)(j)) - 1)
End If
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
Else
For i = 1 To locNumRows
For j = 0 To UBound(locLinesList(i), 1)
locData(i, j + 1) = locLinesList(i)(j)
Next j
Next i
End If
getDataFromFile = locData
Exit Function
error_open_file: 'returns empty variant
unhandled_error: 'returns empty variant
End Function
答案 2 :(得分:1)
我决定在VB.NET中围绕TextFieldParser
构建一个微小的COM感知包装器。不理想,但目前我能想到的最好。
答案 3 :(得分:0)
我过去曾亲自使用CSV Reader来解析巨大的CSV文件(最高1 GB)。性能和简洁性令人难以置信。我强烈建议您使用它。
由于您说您使用过VB.NET,我建议您构建一个引用CSV Reader的简单控制台应用程序。此控制台应用程序将csv文件的路径作为命令行参数“unpivot”。然后,从VBA,您可以使用VBA.Shell来运行您的控制台应用程序,并为其提供CSV文件的路径作为参数。