读取大文件并将其拆分为多个小文件的最佳方法

时间:2014-12-07 23:12:39

标签: file vbscript

我的XML文件大于100MB(超过20 lakh行,即200多万行),无法直接传给我的某个处理程序。我需要按某种分隔标记把它拆分成多个较小的文件。我尝试过在VBScript中使用FileSystemObject,也尝试过批处理(BATCH)文件的方案,但两者都需要8分钟以上才能读取并生成第一个10000行的小文件。

请为这项任务推荐一些性能较好的方案。

感谢您的帮助。

方法一:

'Splits a large XML file into several smaller, self-contained XML files.
'
'The user picks the source file and the minimum number of body lines per
'output file. Each output file receives:
'  1. a copy of the header (every line up to and including the first line
'     that contains "</BLHeader>"),
'  2. a run of body lines that is cut after the first line containing
'     "</EndingTag>" once the requested line count has been reached,
'  3. a closing "</FinalFileTag>" line.
'Output files are created next to the source file as <base name>_<n>.xml.
'
'Body lines are streamed straight to the current output file instead of being
'accumulated in a string: repeated string concatenation re-copies the whole
'buffer for every line and is what makes the naive version take minutes.
'
'Returns the number of files written (0 when the user cancels a dialog).
'Raises an error when the source contains no "</BLHeader>" line.
Function SplitXML()
    Dim oFSO, oSrcFile, oTgtFile
    Dim strFilePath, strTgtPath, strFileName, strHeader, strTemp, strAnswer
    Dim intLinesToSplit, intTemp, intFiles
    Dim blnHeaderDone, blnIsTail, blnTailSeen

    SplitXML = 0
    Set oFSO = CreateObject("Scripting.FileSystemObject")

    strFilePath = Application.GetOpenFilename
    'GetOpenFilename hands back the Boolean False when the dialog is cancelled.
    If VarType(strFilePath) = vbBoolean Then Exit Function

    'InputBox returns a String ("" on Cancel); convert it so that the threshold
    'comparison below is numeric rather than a number-versus-string comparison.
    strAnswer = InputBox("Enter the No of Lines to split with for each file:", "XML Splitter", 10000)
    If Not IsNumeric(strAnswer) Then Exit Function
    intLinesToSplit = CDbl(strAnswer)
    If intLinesToSplit < 1 Then Exit Function

    'Let FileSystemObject take the path apart; Replace() on the full path could
    'also hit a folder that happens to share the file's name.
    strTgtPath = oFSO.GetParentFolderName(strFilePath)
    strFileName = oFSO.GetBaseName(strFilePath)
    Set oSrcFile = oFSO.OpenTextFile(strFilePath)

    'Headers: collect lines up to and including the "</BLHeader>" line. No
    'leading line break is added, so the XML declaration stays on the first line.
    strHeader = ""
    intTemp = 0
    blnHeaderDone = False
    Do Until blnHeaderDone Or oSrcFile.AtEndOfStream
        strTemp = oSrcFile.ReadLine
        If intTemp > 0 Then strHeader = strHeader & vbNewLine
        strHeader = strHeader & strTemp
        intTemp = intTemp + 1
        blnHeaderDone = (InStr(1, strTemp, "</BLHeader>", 1) > 0)
    Loop

    If Not blnHeaderDone Then
        oSrcFile.Close
        Err.Raise vbObjectError + 513, "SplitXML", "No line containing </BLHeader> was found in " & strFilePath
    End If

    'Split
    intFiles = 0
    intTemp = 0
    blnTailSeen = False
    Set oTgtFile = Nothing

    Do While Not oSrcFile.AtEndOfStream
        strTemp = oSrcFile.ReadLine
        blnIsTail = (InStr(1, strTemp, "</FinalFileTag>", 1) > 0)

        If oTgtFile Is Nothing Then
            'Open the next output file lazily, and only for real content, so a
            'trailing blank line or the source's own closing tag does not
            'produce an output file that holds nothing but the header.
            If Len(Trim(strTemp)) > 0 And Not blnIsTail Then
                intFiles = intFiles + 1
                Set oTgtFile = oFSO.CreateTextFile(oFSO.BuildPath(strTgtPath, strFileName & "_" & intFiles & ".xml"), True)
                oTgtFile.WriteLine strHeader
                intTemp = 0
                blnTailSeen = False
            End If
        End If

        If Not oTgtFile Is Nothing Then
            oTgtFile.WriteLine strTemp
            intTemp = intTemp + 1
            If blnIsTail Then blnTailSeen = True

            'Only cut on a record boundary, i.e. on a line that closes a record.
            If intTemp >= intLinesToSplit Then
                If InStr(1, strTemp, "</EndingTag>", 1) > 0 Then
                    If Not blnTailSeen Then oTgtFile.WriteLine "</FinalFileTag>"
                    oTgtFile.Close
                    Set oTgtFile = Nothing
                End If
            End If
        End If
    Loop

    'Flush the remainder. Presumably the source ends with its own
    '"</FinalFileTag>" line (TODO confirm); the tag is appended only when the
    'last chunk did not already receive one from the source.
    If Not oTgtFile Is Nothing Then
        If Not blnTailSeen Then oTgtFile.WriteLine "</FinalFileTag>"
        oTgtFile.Close
    End If

    oSrcFile.Close
    SplitXML = intFiles
End Function

方法2:

@echo off
REM Splits a text file into numbered pieces of at most MaxLines lines each.
REM Every piece is written to OutDir and is named after the input file:
REM base name, underscore, running number, original extension.
REM
REM Delayed expansion stays off while a line is captured, so exclamation
REM marks in the data - XML comments, CDATA sections, DOCTYPE - are kept.
REM Remaining limitations of "for /f": empty lines are dropped and lines
REM that start with a semicolon are skipped.
REM Toggling setlocal for each line costs time; correctness is preferred here.
setlocal DisableDelayedExpansion

set "InFile=c:\ee\EE28352646\in.txt"
set "OutDir=c:\ee\EE28352646"
REM Must be smaller than 2147483647: "set /a" uses signed 32-bit integers
REM and the script computes MaxLines+1 below.
set "MaxLines=1000000"

if not exist "%InFile%" (
  echo *ERROR* Input file does not exist!
  exit /b 1
)

if not exist "%OutDir%\" (
  echo *ERROR* Output folder does not exist!
  exit /b 1
)

for %%A in ("%InFile%") do (
  set "Name=%%~nA"
  set "Ext=%%~xA"
)

REM Start above the limit so that the very first line opens piece number 1.
set /a Line=MaxLines+1
set "File=0"

REM "delims=" keeps each line whole, including its leading indentation.
REM Line and File are updated inside the delayed-expansion scope and carried
REM back out through the inner "for /f", because endlocal discards changes.
for /f "usebackq delims=" %%A in ("%InFile%") do (
  set "Text=%%A"
  set /a Line+=1
  setlocal EnableDelayedExpansion
  if !Line! GTR %MaxLines% (
    set /a File+=1
    set "Line=1"
    if exist "%OutDir%\%Name%_!File!%Ext%" del "%OutDir%\%Name%_!File!%Ext%"
  )
  >>"%OutDir%\%Name%_!File!%Ext%" echo(!Text!
  for /f "tokens=1,2" %%L in ("!Line! !File!") do (
    endlocal
    set "Line=%%L"
    set "File=%%M"
  )
)

1 个答案:

答案 0 :(得分:2)

拆分任何XML文件的最佳方法始终是使用真正的XML解析器来完成。假设您有一个这样的XML文件:

<?xml version="1.0"?>
<Foo>
  <Bar>some</Bar>
  <Bar>other</Bar>
</Foo>

并希望将其拆分为如下的单独文件:

<?xml version="1.0"?>
<Foo>
  <Bar>some</Bar>
</Foo>

<?xml version="1.0"?>
<Foo>
  <Bar>other</Bar>
</Foo>
你可以这样做:

'Splits the <Bar> elements of an XML document across at most numFiles output
'documents. Each output document gets an XML declaration and a <Foo> root
'element, and is saved as C:\path\to\output<n>.xml with n counting up from 0.

numFiles = 4  'maximum number of output files

If numFiles < 1 Then
  Err.Raise vbObjectError + 1, "XML splitter", "numFiles must be at least 1"
End If

Set src = CreateObject("Msxml2.DOMDocument.6.0")
src.async = False
'load returns False when the file is missing or not well-formed; stop here
'instead of silently producing no output.
If Not src.load("C:\path\to\input.xml") Then
  Err.Raise vbObjectError + 2, "XML splitter", "Cannot load input: " & src.parseError.reason
End If

Set nodes = src.selectNodes("//Bar")

'Nodes per output file, rounded UP. Rounding down would yield 0 (and a
'division by zero in the Mod below) when there are fewer nodes than files,
'and would create more than numFiles files when the count does not divide
'evenly.
numNodes = (nodes.length + numFiles - 1) \ numFiles

Set xml = Nothing
fileIndex = 0  'running number of the next file to save
For i = 0 To nodes.length - 1
  'start a new XML object on the first iteration and every time numNodes
  'nodes have been added to the current object
  If i Mod numNodes = 0 Then
    If Not xml Is Nothing Then
      SaveChunk xml, fileIndex
      fileIndex = fileIndex + 1
    End If
    Set xml = CreateObject("Msxml2.DOMDocument.6.0")
    Set root = xml.createElement("Foo")
    xml.appendChild root
  End If
  root.appendChild nodes.item(i)
Next

'save the last, possibly partially filled, XML object
If Not xml Is Nothing Then SaveChunk xml, fileIndex

'Prepends the XML declaration to doc and saves it as output<index>.xml.
'An explicit counter is used for the name: deriving it from the loop variable
'makes the final file overwrite the previous one whenever the node count is
'not a multiple of numNodes.
Sub SaveChunk(doc, index)
  Dim prolog
  Set prolog = doc.createProcessingInstruction("xml", "version='1.0'")
  doc.insertBefore prolog, doc.childNodes(0)
  doc.save "C:\path\to\output" & index & ".xml"
End Sub