读取大文本文件会使Excel冻结,可以使用Python完成吗?

时间:2019-05-08 20:20:40

标签: python excel vba

我正在通过SQL导入代码通过Excel VBA将数据导入sheet6。然后,我遍历导入数据的每一行,以基于两个条件寻找匹配项。带有200万行数据的大型文本文件的一行中存在A列和G列的值。如果找到匹配项,则将逗号分隔的文本文件中的第一个值添加到E列。

我曾尝试研究Excel VBA代码,但是在循环访问5000行数据时,这确实很慢。最多可能需要30分钟才能运行。希望Python解决方案可能是答案。

Const strFileName = "T:\Hex\ModeS-Mil.txt"

Sub FillMTextFile()
Dim StartTime As Double
Dim SecondsElapsed As Double
Dim strsearch As String
Dim MReg As String
Dim MType As String
Dim strLine As String
Dim f As Integer
Dim lngLine As Long
Dim blnFound As Boolean
Dim x As Long
Dim lrow As Long
lrow = Sheet6.Range("A" & Rows.Count).End(xlUp).Row
StartTime = Timer
For x = 2 To 3000
MReg = Sheet6.Range("A" & x).Value
MType = Sheet6.Range("G" & x).Value
strsearch = MReg & "," & MType
f = FreeFile
Open strFileName For Input As #f
Do While Not EOF(f)
lngLine = lngLine + 1
Line Input #f, strLine
If InStr(1, strLine, strsearch, vbBinaryCompare) > 0 Then
Sheet6.Range("E" & x).Value = UCase(Split(strLine, ",")(0))
On Error GoTo err
blnFound = True
Exit Do
End If
Loop
Close #f
If Not blnFound Then
End If
err:
Next x
SecondsElapsed = Round(Timer - StartTime, 2)
MsgBox "code time " & SecondsElapsed & " seconds", vbInformation
End Sub

1 个答案:

答案 0 :(得分:0)

此方法是否有效取决于以下所有条件是否正确:

  • 您可以预先指定CSV中的哪些列对应于Sheet6的A和G列
  • 您可以预先指定要写入CSV的哪一列到Sheet6的E列
  • CSV中的值本身都不包含任何换行符或逗号。

我希望一切都有意义。 ^

Option Explicit

Const CSV_FILE_PATH As String = "T:\Hex\ModeS-Mil.txt"
Const FIRST_ROW_CONTAINING_DATA As Long = 2 ' Exclude header row'

Sub FillMTextFile()

    Dim csvMap As Collection
    Set csvMap = GetMapForCsv()

    Dim lastRow As Long
    lastRow = Sheet6.Cells(Sheet6.Rows.Count, "A").End(xlUp).Row
    Debug.Assert lastRow >= FIRST_ROW_CONTAINING_DATA

    Dim substringsInColumnA() As Variant ' This needs a better name, especially if your columns change.'
    substringsInColumnA = Sheet6.Range("A" & FIRST_ROW_CONTAINING_DATA, "A" & lastRow).Value

    Dim substringsInColumnG() As Variant ' This needs a better name, especially if your columns change.'
    substringsInColumnG = Sheet6.Range("G" & FIRST_ROW_CONTAINING_DATA, "G" & lastRow).Value

    Dim outputRowCount As Long
    outputRowCount = lastRow - FIRST_ROW_CONTAINING_DATA + 1

    Dim arrayToWriteToSheet() As String
    ReDim arrayToWriteToSheet(1 To outputRowCount, 1 To 1)
    Debug.Assert LBound(arrayToWriteToSheet, 1) = LBound(substringsInColumnA, 1)

    Dim rowIndex As Long
    For rowIndex = LBound(substringsInColumnA, 1) To UBound(substringsInColumnA, 1)

        Dim substringsToJoin(0 To 1) As String ' Static, re-use every iteration'
        substringsToJoin(0) = UCase$(CStr(substringsInColumnA(rowIndex, 1)))
        substringsToJoin(1) = UCase$(CStr(substringsInColumnG(rowIndex, 1)))

        Dim substringToSearchFor As String
        substringToSearchFor = Join$(substringsToJoin, ",")

        arrayToWriteToSheet(rowIndex, 1) = _
            GetCollectionItemOrDefault(someCollection:=csvMap, someKey:=substringToSearchFor)

    Next rowIndex

    Sheet6.Cells(FIRST_ROW_CONTAINING_DATA, "E").Resize(UBound(arrayToWriteToSheet, 1), UBound(arrayToWriteToSheet, 2)) = arrayToWriteToSheet
End Sub

Private Function GetMapForCsv() As Collection
    ' This function tries to take advantage of which columns '
    ' in the CSV to look at -- and builds a collection in which: '
    '   • each collection key = the concatenation of columns that need to contain matches '
    '   • each collection item = the value that we later want to look up '

    Const CSV_COLUMN_TO_WRITE_TO_COLUMN_E As Long = 1 ' First column'
    Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_A As Long = 2 ' Second column'
    Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_G As Long = 3 ' Third column'

    Dim outputCollection As Collection
    Set outputCollection = New Collection

    Dim fileHandle As Long
    fileHandle = FreeFile

    Open CSV_FILE_PATH For Input As #fileHandle ' Open once outside of loop.'
    Do Until EOF(fileHandle)

        ' File is probably too large to fit in memory, hence iterating line by line'
        ' but I think this assumes there are no escaped/quoted new line characters in the values themselves'
        Dim currentLineInCsv As String
        Line Input #fileHandle, currentLineInCsv
        currentLineInCsv = UCase$(currentLineInCsv)

        Dim currentValues() As String
        currentValues = Split(currentLineInCsv, ",", -1, vbBinaryCompare)

        Dim toJoin(0 To 1) As String
        toJoin(0) = currentValues(CSV_COLUMN_CORRESPONDING_TO_COLUMN_A - 1) ' 0-based array assumed'
        toJoin(1) = currentValues(CSV_COLUMN_CORRESPONDING_TO_COLUMN_G - 1) ' 0-based array assumed'

        Dim collectionKey As String
        collectionKey = Join$(toJoin, ",")

        Dim collectionItem As String
        collectionItem = currentValues(CSV_COLUMN_TO_WRITE_TO_COLUMN_E - 1) ' 0-based array assumed'

        On Error Resume Next
        outputCollection.Add Item:=collectionItem, key:=collectionKey
        On Error GoTo 0
    Loop
    Close #fileHandle

    Set GetMapForCsv = outputCollection
End Function

Private Function GetCollectionItemOrDefault(ByVal someCollection As Collection, ByVal someKey As String, Optional ByVal defaultValue As String = vbNullString) As String
    ' Returns the item corresponding to some key of a collection'
    ' If the key does not exist, returns the default.'

    Dim keyDoesNotExist As Boolean
    Dim itemToReturn As String

    On Error Resume Next
    itemToReturn = someCollection(someKey)
    keyDoesNotExist = (Err.Number <> 0)
    On Error GoTo 0

    If keyDoesNotExist Then itemToReturn = defaultValue
    GetCollectionItemOrDefault = itemToReturn
End Function

关于实现,您将需要更改这些行(在GetMapForCsv函数的代码中):

Const CSV_COLUMN_TO_WRITE_TO_COLUMN_E As Long = 1 ' First column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_A As Long = 2 ' Second column'
Const CSV_COLUMN_CORRESPONDING_TO_COLUMN_G As Long = 3 ' Third column'

,以便您的CSV分配正确。当前,以上分配假定:

  • CSV的第二列对应于Sheet6的A列中的值
  • CSV中的第三列对应于Sheet6的G列中的值
  • 您要在CSV的第一列中查找值并将其写入Sheet6的E列中。

我的CSV仅包含3列,我的Sheet6包含1万个要查找的值;在我的机器上,上面的代码是:

  • 〜1000万行CSV中约195.6秒
  • 〜200万行CSV中约35.2秒

替代方法可能包括:

  • 使用Python。应该可以使用生成器表达式以节省内存的方式循环遍历CSV文件的200万行。我认为内置的csv模块可以处理此任务,但是您可能还需要一些Python模块/库来与Excel进行交互,例如openpyxlXlsxWriter(以便您可以读取列A和G中的值)。 Python代码可能会更短/更易于维护。

  • 使用Power Query(这是Windows上Excel的一部分已经有好几年了)。您可能会将A列和G列读入一个表中,并将CSV读入另一个表中。输出可能会作为表格加载到工作表中。不确定性能/效率如何,但是值得以允许您使用Table.Merge的方式设置数据。否则,也许您可​​能会退回到Text.Contains之类。