使用VBA HTML从网页下载文件

时间:2013-06-20 22:28:48

标签: html excel vba

我几个月来一直在拼命地尝试自动化一个过程,在这个过程中,csv文件被下载,编辑并保存在给定的位置。 到目前为止,我只使用excel vba来打开网页并点击底部下载csv文件,但代码停止并需要手动干预才能完成,我希望它尽可能完全自动化。 看到使用的代码(我不是作者):

Sub WebDataExtraction()
Dim URL As String
Dim IeApp As Object
Dim IeDoc As Object
Dim ieForm As Object
Dim ieObj As Object
Dim objColl As Collection

URL = "http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT"

Set IeApp = CreateObject("InternetExplorer.Application")
IeApp.Visible = True
IeApp.Navigate URL

Do Until IeApp.ReadyState = READYSTATE_COMPLETE
Loop

Set IeDoc = IeApp.Document
For Each ele In IeApp.Document.getElementsByTagName("span")

If ele.innerHTML = "CSV" Then
Application.Wait (Now + TimeValue("0:00:15"))
DoEvents
ele.Click
'At this point you need to Save the document manually
' or figure out for yourself how to automate this interaction.
Test_Save_As_Set_Filename
File_Download_Click_Save
End If

Next

IeApp.Quit
End Sub"

提前致谢

南兹奥

2 个答案:

答案 0 :(得分:1)

我发布了第二个答案,因为我认为我的第一个答案对于许多类似的应用程序来说已经足够了,因此在这种情况下它不起作用。

为什么其他方法失败:

  • .Click方法:这会引发一个新窗口,需要在运行时输入用户,似乎不可能使用WinAPI来控制此窗口。或者,至少我不能确定任何方式。代码执行在.Click行停止,直到用户手动干预,无法使用GoToWait或任何其他方法来规避此行为。
  • 使用WinAPI函数直接下载源文件不起作用,因为按钮的URL不包含文件,而是包含动态提供文件的js函数。

以下是我建议的解决方案解决方案:

您可以阅读网页的.body.InnerText,使用FileSystemObject将其写入纯文本/ csv文件,然后使用Regular Expressions和字符串操作的组合,将数据解析为正确分隔的CSV文件。

Sub WebDataExtraction()
    Dim url As String
    Dim fName As String
    Dim lnText As String
    Dim varLine() As Variant
    Dim vLn As Variant
    Dim newText As String
    Dim leftText As String
    Dim breakTime As Date
'## Requires reference to Microsoft VBScript Regular Expressions 5.5
    Dim REMatches As MatchCollection
    Dim m As Match
'## Requires reference to Microsoft Internet Controls
    Dim IeApp As InternetExplorer
'## Requires reference to Microsoft HTML object library
    Dim IeDoc As HTMLDocument
    Dim ele As HTMLFormElement
'## Requires reference to Microsoft Scripting Runtime
    Dim fso As FileSystemObject
    Dim f As TextStream
    Dim ln As Long: ln = 1


    breakTime = DateAdd("s", 60, Now)
    url = "http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT"
    Set IeApp = CreateObject("InternetExplorer.Application")

    With IeApp
        .Visible = True
        .Navigate url

        Do Until .ReadyState = 4
        Loop

        Set IeDoc = .Document
    End With
    'Wait for the data to display on the page
    Do
        If Now >= breakTime Then
            If MsgBox("The website is taking longer than usual, would you like to continue waiting?", vbYesNo) = vbNo Then
                GoTo EarlyExit
            Else:
                breakTime = DateAdd("s", 60, Now)
            End If
        End If
    Loop While Trim(IeDoc.body.innerText) = "XML CSV Please Wait Data Loading Sorting"

    '## Create the text file
    fName = ActiveWorkbook.Path & "\exported-csv.csv"
    Set fso = CreateObject("Scripting.FileSystemObject")
    Set f = fso.OpenTextFile(fName, 2, True, -1)
    f.Write IeDoc.body.innerText
    f.Close
    Set f = Nothing

    '## Read the text file
    Set f = fso.OpenTextFile(fName, 1, False, -1)
    Do
        lnText = f.ReadLine
        '## The data starts on the 4th line in the InnerText.
        If ln >= 4 Then
            '## Return a collection of matching date/timestamps to which we can parse
            Set REMatches = SplitLine(lnText)
            newText = lnText
            For Each m In REMatches
                newText = Replace(newText, m.Value, ("," & m.Value & ","), , -1, vbTextCompare)
            Next
            '## Get rid of consecutive delimiters:
            Do
                newText = Replace(newText, ",,", ",")
            Loop While InStr(1, newText, ",,", vbBinaryCompare) <> 0
            '## Then use some string manipulation to parse out the first 2 columns which are
            '   not a match to the RegExp we used above.
            leftText = Left(newText, InStr(1, newText, ",", vbTextCompare) - 1)
            leftText = Left(leftText, 10) & "," & Right(leftText, Len(leftText) - 10)
            newText = Right(newText, Len(newText) - InStr(1, newText, ",", vbTextCompare))
            newText = leftText & "," & newText

            '## Store these lines in an array
            ReDim Preserve varLine(ln - 4)
            varLine(ln - 4) = newText
        End If
        ln = ln + 1

    Loop While Not f.AtEndOfStream
    f.Close

'## Re-open the file for writing the delimited lines:
    Set f = fso.OpenTextFile(fName, 2, True, -1)
    '## Iterate over the array and write the data in CSV:
    For Each vLn In varLine
        'Omit blank lines, if any.
        If Len(vLn) <> 0 Then f.WriteLine vLn
    Next
    f.Close

EarlyExit:
    Set fso = Nothing
    Set f = Nothing
    IeApp.Quit
    Set IeApp = Nothing

End Sub

Function SplitLine(strLine As String) As MatchCollection
'returns a RegExp MatchCollection of Date/Timestamps found in each line
'## Requires reference to Microsoft VBScript Regular Expressions 5.5
Dim RE As RegExp
Dim matches As MatchCollection
    Set RE = CreateObject("vbscript.regexp")
    With RE
        .MultiLine = False
        .Global = True
        .IgnoreCase = True
        '## Use this RegEx pattern to parse the date & timestamps:
        .Pattern = "(19|20)\d\d[-](0[1-9]|1[012])[-](0[1-9]|[12][0-9]|3[01])[ ]\d\d?:\d\d:\d\d"
    End With
    Set matches = RE.Execute(strLine)
    Set SplitLine = matches
End Function

答案 1 :(得分:0)

修改

我使用网址测试了原始答案代码:

http://www.bmreports.com/bsp/BMRSSystemData.php?pT=DDAD&zT=N&dT=NRT#saveasCSV

但是对于这个网站来说,这种方法似乎不起作用。 ele.Click似乎没有启动下载,只是打开网页上的数据表格。要下载,您需要右键单击/另存为。如果你已经走得那么远(我怀疑,基于你正在调用的子程序,但你没有提供代码),那么你可以使用Win API来获取Save对话框的HWND并可能自动化事件。 Santosh提供了一些相关信息:

VBA - Go to website and download file from save prompt

这也是一个有助于解决问题的好资源:

http://social.msdn.microsoft.com/Forums/en-US/beb6fa0e-fbc8-49df-9f2e-30f85d941fad/download-file-from-ie-with-vba

原始答案

如果您能够确定CSV的URL,则可以使用此子例程打开与CSV数据的连接,并将其直接导入工作簿。您可能需要对导入的数据自动执行文本到列操作,但可以使用宏录制器轻松复制。我在下面的Test()子例程中给出了一个例子。

您可以轻松修改此项以在新工作簿中添加QueryTables,然后自动执行该工作簿上的SaveAs方法以将文件另存为CSV。

此示例使用Yahoo Finance,Ford Motor Company的已知URL,并将在活动工作表的单元格QueryTables中添加带有CSV数据的A1。这可以很容易地修改,以将其放在另一个工作表,另一个工作簿等。

Sub Test()
Dim MyURL as String
MyURL = "http://ichart.finance.yahoo.com/table.csv?s=GM&a0&b=1&c2010&d=05&e=20&f=2013&g=d&ignore=.csv"

OpenURL MyURL

'Explode the CSV data:
Range("A:A").TextToColumns Destination:=Range("A1"), DataType:=xlDelimited, _
    TextQualifier:=xlDoubleQuote, ConsecutiveDelimiter:=False, Tab:=False, _
    Semicolon:=False, Comma:=True, Space:=False, Other:=False, FieldInfo _
    :=Array(Array(1, 3), Array(2, 1), Array(3, 1), Array(4, 1), Array(5, 1), Array(6, 1), _
    Array(7, 1)), TrailingMinusNumbers:=True

End Sub

Private Sub OpenURL(fullURL As String)

'This opens the CSV in querytables connection.
On Error GoTo ErrOpenURL
    With ActiveSheet.QueryTables.Add(Connection:= _
        "URL;" & fullURL, Destination:=Range("A1"))
        .Name = fullURL
        .FieldNames = True
        .RowNumbers = False
        .FillAdjacentFormulas = False
        .PreserveFormatting = True
        .RefreshOnFileOpen = False
        .BackgroundQuery = True
        .RefreshStyle = xlInsertDeleteCells
        .SavePassword = True
        .SaveData = True
        .AdjustColumnWidth = True
        .RefreshPeriod = 0
        .WebSelectionType = xlEntirePage
        .WebFormatting = xlWebFormattingAll
        .WebPreFormattedTextToColumns = True
        .WebConsecutiveDelimitersAsOne = True
        .WebSingleBlockTextImport = False
        .WebDisableDateRecognition = False
        .WebDisableRedirections = False
        .Refresh BackgroundQuery:=False
    End With

ExitOpenURL:
Exit Sub 'if all goes well, you can exit

'Error handling...

ErrOpenURL:
Err.Clear
bCancel = True
Resume ExitOpenURL


End Sub