我正在通过SQL导入代码通过Excel VBA将数据导入sheet6
。然后,我遍历导入数据的每一行,以基于两个条件寻找匹配项。带有200万行数据的大型文本文件的一行中存在A列和G列的值。如果找到匹配项,则将逗号分隔的文本文件中的第一个值添加到E列。
我曾尝试研究Excel VBA代码,但是在循环访问5000行数据时,这确实很慢。最多可能需要30分钟才能运行。希望Python解决方案可能是答案。
Const strFileName = "T:\Hex\ModeS-Mil.txt"
Sub FillMTextFile()
Dim StartTime As Double
Dim SecondsElapsed As Double
Dim strsearch As String
Dim MReg As String
Dim MType As String
Dim strLine As String
Dim f As Integer
Dim lngLine As Long
Dim blnFound As Boolean
Dim x As Long
Dim lrow As Long
lrow = Sheet6.Range("A" & Rows.Count).End(xlUp).Row
StartTime = Timer
For x = 2 To 3000
MReg = Sheet6.Range("A" & x).Value
MType = Sheet6.Range("G" & x).Value
strsearch = MReg & "," & MType
f = FreeFile
Open strFileName For Input As #f
Do While Not EOF(f)
lngLine = lngLine + 1
Line Input #f, strLine
If InStr(1, strLine, strsearch, vbBinaryCompare) > 0 Then
Sheet6.Range("E" & x).Value = UCase(Split(strLine, ",")(0))
On Error GoTo err
blnFound = True
Exit Do
End If
Loop
Close #f
If Not blnFound Then
End If
err:
Next x
SecondsElapsed = Round(Timer - StartTime, 2)
MsgBox "code time " & SecondsElapsed & " seconds", vbInformation
End Sub
答案 0 :(得分:0)
此方法是否有效取决于以下所有条件是否正确:
Sheet6
的A和G列Sheet6
的E列我希望一切都有意义。 ^
Option Explicit
Const CSV_FILE_PATH As String = "T:\Hex\ModeS-Mil.txt"
Const FIRST_ROW_CONTAINING_DATA As Long = 2 ' Exclude header row'
Sub FillMTextFile()
Dim csvMap As Collection
Set csvMap = GetMapForCsv()
Dim lastRow As Long
lastRow = Sheet6.Cells(Sheet6.Rows.Count, "A").End(xlUp).Row
Debug.Assert lastRow >= FIRST_ROW_CONTAINING_DATA
Dim substringsInColumnA() As Variant ' This needs a better name, especially if your columns change.'
substringsInColumnA = Sheet6.Range("A" & FIRST_ROW_CONTAINING_DATA, "A" & lastRow).Value
Dim substringsInColumnG() As Variant ' This needs a better name, especially if your columns change.'
substringsInColumnG = Sheet6.Range("G" & FIRST_ROW_CONTAINING_DATA, "G" & lastRow).Value
Dim outputRowCount As Long
outputRowCount = lastRow - FIRST_ROW_CONTAINING_DATA + 1
Dim arrayToWriteToSheet() As String
ReDim arrayToWriteToSheet(1 To outputRowCount, 1 To 1)
Debug.Assert LBound(arrayToWriteToSheet, 1) = LBound(substringsInColumnA, 1)
Dim rowIndex As Long
For rowIndex = LBound(substringsInColumnA, 1) To UBound(substringsInColumnA, 1)
Dim substringsToJoin(0 To 1) As String ' Static, re-use every iteration'
substringsToJoin(0) = UCase$(CStr(substringsInColumnA(rowIndex, 1)))
substringsToJoin(1) = UCase$(CStr(substringsInColumnG(rowIndex, 1)))
Dim substringToSearchFor As String
substringToSearchFor = Join$(substringsToJoin, ",")
arrayToWriteToSheet(rowIndex, 1) = _
GetCollectionItemOrDefault(someCollection:=csvMap, someKey:=substringToSearchFor)
Next rowIndex
Sheet6.Cells(FIRST_ROW_CONTAINING_DATA, "E").Resize(UBound(arrayToWriteToSheet, 1), UBound(arrayToWriteToSheet, 2)) = arrayToWriteToSheet
End Sub
Private Function GetMapForCsv() As Collection
' This function tries to take advantage of which columns '
' in the CSV to look at -- and builds a collection in which: '
' • each collection key = the concatenation of columns that need to contain matches '
' • each collection item = the value that we later want to look up '
Const CSV_COLUMN_TO_WRITE_TO_COLUMN_E As Long = 1 ' First column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_A As Long = 2 ' Second column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_G As Long = 3 ' Third column'
Dim outputCollection As Collection
Set outputCollection = New Collection
Dim fileHandle As Long
fileHandle = FreeFile
Open CSV_FILE_PATH For Input As #fileHandle ' Open once outside of loop.'
Do Until EOF(fileHandle)
' File is probably too large to fit in memory, hence iterating line by line'
' but I think this assumes there are no escaped/quoted new line characters in the values themselves'
Dim currentLineInCsv As String
Line Input #fileHandle, currentLineInCsv
currentLineInCsv = UCase$(currentLineInCsv)
Dim currentValues() As String
currentValues = Split(currentLineInCsv, ",", -1, vbBinaryCompare)
Dim toJoin(0 To 1) As String
toJoin(0) = currentValues(CSV_COLUMN_CORRESPONDING_TO_COLUMN_A - 1) ' 0-based array assumed'
toJoin(1) = currentValues(CSV_COLUMN_CORRESPONDING_TO_COLUMN_G - 1) ' 0-based array assumed'
Dim collectionKey As String
collectionKey = Join$(toJoin, ",")
Dim collectionItem As String
collectionItem = currentValues(CSV_COLUMN_TO_WRITE_TO_COLUMN_E - 1) ' 0-based array assumed'
On Error Resume Next
outputCollection.Add Item:=collectionItem, key:=collectionKey
On Error GoTo 0
Loop
Close #fileHandle
Set GetMapForCsv = outputCollection
End Function
Private Function GetCollectionItemOrDefault(ByVal someCollection As Collection, ByVal someKey As String, Optional ByVal defaultValue As String = vbNullString) As String
' Returns the item corresponding to some key of a collection'
' If the key does not exist, returns the default.'
Dim keyDoesNotExist As Boolean
Dim itemToReturn As String
On Error Resume Next
itemToReturn = someCollection(someKey)
keyDoesNotExist = (Err.Number <> 0)
On Error GoTo 0
If keyDoesNotExist Then itemToReturn = defaultValue
GetCollectionItemOrDefault = itemToReturn
End Function
关于实现,您将需要更改这些行(在GetMapForCsv
函数的代码中):
Const CSV_COLUMN_TO_WRITE_TO_COLUMN_E As Long = 1 ' First column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_A As Long = 2 ' Second column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_G As Long = 3 ' Third column'
,以便您的CSV分配正确。当前,以上分配假定:
Sheet6
的A列中的值Sheet6
的G列中的值Sheet6
的E列中。我的CSV仅包含3列,我的Sheet6
包含1万个要查找的值;在我的机器上,上面的代码是:
替代方法可能包括:
使用Python。应该可以使用生成器表达式以节省内存的方式循环遍历CSV文件的200万行。我认为内置的csv
模块可以处理此任务,但是您可能还需要一些Python模块/库来与Excel进行交互,例如openpyxl
,XlsxWriter
(以便您可以读取列A和G中的值)。 Python代码可能会更短/更易于维护。
使用Power Query(这是Windows上Excel的一部分已经有好几年了)。您可能会将A列和G列读入一个表中,并将CSV读入另一个表中。输出可能会作为表格加载到工作表中。不确定性能/效率如何,但是值得以允许您使用Table.Merge
的方式设置数据。否则,也许您可能会退回到Text.Contains
之类。