使用MSXML2.XMLHTTP60对象解析HTML数据时出现错误91

时间:2019-01-07 05:04:20

标签: excel vba debugging web-scraping

我正在使用MSXML2.XMLHTTP60对象连接到网站,获取HTML并解析它以获取相关数据。本质上,这是一个网络爬虫。

进入第10个左右后,出现91错误。意味着该数据不存在于对象中。我结束程序,然后,当我重新启动时,在以前可以正常工作的另一个位置收到了此错误。我似乎无法摆脱它。后来我发现可以通过重新启动Excel来“解决”第二个91错误。

现在,对于第一个91错误,我发现当我逐步执行程序时,一切正常。然后,我在经过如此多的循环后添加了延迟。在第一个91错误的位置附近,延迟大约3秒钟。这似乎解决了。

我不太明白,为什么这种延误如此严重?我想消除这种延迟并使程序以最大速度运行。除非由于某些原因而绝对必要。我认为我不需要延迟才能使该功能正常工作。

这是完整的代码;标记发生错误的地方。

Option Explicit

Public baseLink As String
Public baseRow As String
Public basePageNav As String
Public rowNumber As Integer
Public someCounter As Integer
Public masterBrowser As New MSXML2.XMLHTTP60


Sub Initialization()
    baseLink = "https://www.fbo.gov/index.php"
    baseRow = "row_"
    basePageNav = "?s=opportunity&mode=list&tab=list&pageID="
    rowNumber = 2
    someCounter = 1
End Sub


Sub extractData(URLLink As String)
   ' Dim httpAddress As New MSXML2.XMLHTTP60
    Dim cResponse As String
   ' Set HTMLdoc = CreateObject("htmlfile")
    Dim HTMLdoc As New HTMLDocument
    Dim testString As String
    Dim closePos As Integer

    Dim titleString As String
    Dim solicitationNumber As String
    Dim agencyName As String
    Dim officeName As String
    Dim locationName As String
    Dim postedDate As String
    Dim responseDate As String
    Dim noticeString As String
    Dim classificationCode As String
    Dim NAICScode As String

    Dim temp As String

    With masterBrowser
        .Open "GET", URLLink, False
        .send
    End With

    With masterBrowser
        While Not .readyState = 4
            Application.Wait Now + TimeValue("0:00:01")
        Wend

        While Not .Status = 200
            Application.Wait Now + TimeValue("0:00:01")
        Wend

        cResponse = StrConv(.responseBody, vbUnicode)
    End With

    ' For some reason, there is a bug in the library. If the applciation runs
    ' too fast, then the program will eventually recieve a 91 error.
    ' This if statement will cause the program to delay by 3 secs every 9th
    ' entry in order to prevent the 91 error from appearing

'    If someCounter = 9 Then
 '       Application.Wait (Now + TimeValue("0:00:03"))
  '      someCounter = 1
   ' Else
    '    someCounter = someCounter + 1
    'End If

   ' cResponse = StrConv(httpAddress.responseBody, vbUnicode)
    cResponse = Mid$(cResponse, InStr(1, cResponse, "<!DOCTYPE "))

    With HTMLdoc
        .body.innerHTML = cResponse

        If .getElementsByClassName("agency-header")(0) Is Nothing Then 'If this is empty, then this will cause the first 91 error
            MsgBox ("It is empty")

            ' So, lets try another reconnect?

            Dim anotherAttemptB As String

            With masterBrowser
                .Open "GET", URLLink, False
                .send
                anotherAttemptB = StrConv(.responseBody, vbUnicode)
            End With

            anotherAttemptB = Mid$(anotherAttemptB, InStr(1, anotherAttemptB, "<!DOCTYPE "))
            .body.innerHTML = anotherAttemptB
            closePos = InStr(.getElementsByClassName("agency-header")(0).innerHTML, "/") ' 91 error caused here
        Else
            closePos = InStr(.getElementsByClassName("agency-header")(0).innerHTML, "/")
        End If

        titleString = Mid(.getElementsByClassName("agency-header")(0).innerHTML, 1, closePos)
        titleString = Replace(titleString, "<H2>", "")
        titleString = Replace(titleString, "</", "")

        testString = .getElementsByClassName("sol-num")(0).innerHTML
        solicitationNumber = Replace(.getElementsByClassName("sol-num")(0).innerHTML, "Solicitation Number: ", "")

        testString = .getElementsByClassName("agency-name")(0).innerHTML

        agencyName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(0)
        agencyName = Replace(agencyName, "Agency: ", "")

        officeName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(1)
        officeName = Replace(officeName, "Office: ", "")

        locationName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(2)
        locationName = Replace(locationName, "Location: ", "")

        postedDate = .getElementById("dnf_class_values_procurement_notice__posted_date__widget").innerHTML

        responseDate = .getElementById("dnf_class_values_procurement_notice__response_deadline__widget").innerHTML
        responseDate = Replace(responseDate, "&nbsp;", " ")

        noticeString = .getElementById("dnf_class_values_procurement_notice__procurement_type__widget").innerHTML
        classificationCode = .getElementById("dnf_class_values_procurement_notice__classification_code__widget").innerHTML

        NAICScode = .getElementById("dnf_class_values_procurement_notice__naics_code__widget").innerHTML
        closePos = InStr(NAICScode, " ")
        NAICScode = Mid(NAICScode, 1, closePos)
    End With

    Worksheets("Sheet1").Range("A" & rowNumber).Value = titleString
    Worksheets("Sheet1").Range("B" & rowNumber).Value = solicitationNumber
    Worksheets("Sheet1").Range("C" & rowNumber).Value = agencyName
    Worksheets("Sheet1").Range("D" & rowNumber).Value = officeName
    Worksheets("Sheet1").Range("E" & rowNumber).Value = locationName
    Worksheets("Sheet1").Range("F" & rowNumber).Value = postedDate
    Worksheets("Sheet1").Range("G" & rowNumber).Value = responseDate
    Worksheets("Sheet1").Range("H" & rowNumber).Value = noticeString
    Worksheets("Sheet1").Range("I" & rowNumber).Value = classificationCode
    Worksheets("Sheet1").Range("J" & rowNumber).Value = NAICScode
    Worksheets("Sheet1").Range("K" & rowNumber).Value = URLLink

    rowNumber = rowNumber + 1

    Set HTMLdoc = Nothing

End Sub

Sub test()
    Initialization

    Dim linkArray(19) As String
    Dim xmlhttp As New MSXML2.XMLHTTP60
    Dim myUrl As String
    Dim i As Integer
    Dim sResponse As String
    'Set HTML = CreateObject("htmlfile")
    Dim HTML As New HTMLDocument

    Dim testStrin As String

    Worksheets("Sheet1").Range("A1").Value = "Title:"
    Worksheets("Sheet1").Range("B1").Value = "Solicitation #:"
    Worksheets("Sheet1").Range("C1").Value = "Agency:"
    Worksheets("Sheet1").Range("D1").Value = "Office:"
    Worksheets("Sheet1").Range("E1").Value = "Location:"
    Worksheets("Sheet1").Range("F1").Value = "Posted On:"
    Worksheets("Sheet1").Range("G1").Value = "Response Date:"
    Worksheets("Sheet1").Range("H1").Value = "Notice Type:"
    Worksheets("Sheet1").Range("I1").Value = "Classification Code:"
    Worksheets("Sheet1").Range("J1").Value = "NAICS Code:"
    Worksheets("Sheet1").Range("K1").Value = "Link:"

    myUrl = baseLink & basePageNav & "1"

    With masterBrowser
        .Open "GET", myUrl, False
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
    End With

'    sResponse = StrConv(xmlhttp.responseBody, vbUnicode)
    sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))

    With HTML
        .body.innerHTML = sResponse

        For i = 0 To 19
            Dim linkString As String
            Dim openPos As Integer
            Dim closePos As Integer

            If HTML.getElementById(baseRow & CStr(i)) Is Nothing Then ' To make matters even more confusing, after the first 91 error, here is the second until you restart excel
                MsgBox ("It is also empty")
                Dim anotherAttempt As String

                With masterBrowser
                    .Open "GET", myUrl, False
                    .send
                    anotherAttempt = StrConv(.responseBody, vbUnicode)
                End With

                anotherAttempt = Mid$(anotherAttempt, InStr(1, anotherAttempt, "<!DOCTYPE "))

                .body.innerHTML = anotherAttempt

                linkString = .getElementById(baseRow & CStr(i)).Children(0).innerHTML
            Else
                linkString = .getElementById(baseRow & CStr(i)).Children(0).innerHTML
            End If

            openPos = InStr(linkString, "?")
            closePos = InStr(linkString, "w") + 3

            linkString = Mid(linkString, openPos, closePos - openPos)
            linkString = Replace(linkString, "amp;", "")
            linkString = baseLink & linkString

            linkArray(i) = linkString
        Next i
    End With



    For i = 0 To 19
        extractData (linkArray(i))
    Next i

    Set HTML = Nothing

    MsgBox ("Finished")

End Sub

根据QHarr的要求,第一次出现91错误时的响应文本如下:

"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1

以及第二个91错误发生时的响应文本:

"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1

91错误之前的响应文本:

"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1

0 个答案:

没有答案