我正在使用MSXML2.XMLHTTP60
对象连接到网站,获取HTML并解析它以获取相关数据。本质上,这是一个网络爬虫。
进入第10个左右后,出现91错误。意味着该数据不存在于对象中。我结束程序,然后,当我重新启动时,在以前可以正常工作的另一个位置收到了此错误。我似乎无法摆脱它。后来我发现可以通过重新启动Excel来“解决”第二个91错误。
现在,对于第一个91错误,我发现当我逐步执行程序时,一切正常。然后,我在经过如此多的循环后添加了延迟。在第一个91错误的位置附近,延迟大约3秒钟。这似乎解决了。
我不太明白,为什么这种延误如此严重?我想消除这种延迟并使程序以最大速度运行。除非由于某些原因而绝对必要。我认为我不需要延迟才能使该功能正常工作。
这是完整的代码;标记发生错误的地方。
Option Explicit
Public baseLink As String
Public baseRow As String
Public basePageNav As String
Public rowNumber As Integer
Public someCounter As Integer
Public masterBrowser As New MSXML2.XMLHTTP60
Sub Initialization()
baseLink = "https://www.fbo.gov/index.php"
baseRow = "row_"
basePageNav = "?s=opportunity&mode=list&tab=list&pageID="
rowNumber = 2
someCounter = 1
End Sub
Sub extractData(URLLink As String)
' Dim httpAddress As New MSXML2.XMLHTTP60
Dim cResponse As String
' Set HTMLdoc = CreateObject("htmlfile")
Dim HTMLdoc As New HTMLDocument
Dim testString As String
Dim closePos As Integer
Dim titleString As String
Dim solicitationNumber As String
Dim agencyName As String
Dim officeName As String
Dim locationName As String
Dim postedDate As String
Dim responseDate As String
Dim noticeString As String
Dim classificationCode As String
Dim NAICScode As String
Dim temp As String
With masterBrowser
.Open "GET", URLLink, False
.send
End With
With masterBrowser
While Not .readyState = 4
Application.Wait Now + TimeValue("0:00:01")
Wend
While Not .Status = 200
Application.Wait Now + TimeValue("0:00:01")
Wend
cResponse = StrConv(.responseBody, vbUnicode)
End With
' For some reason, there is a bug in the library. If the applciation runs
' too fast, then the program will eventually recieve a 91 error.
' This if statement will cause the program to delay by 3 secs every 9th
' entry in order to prevent the 91 error from appearing
' If someCounter = 9 Then
' Application.Wait (Now + TimeValue("0:00:03"))
' someCounter = 1
' Else
' someCounter = someCounter + 1
'End If
' cResponse = StrConv(httpAddress.responseBody, vbUnicode)
cResponse = Mid$(cResponse, InStr(1, cResponse, "<!DOCTYPE "))
With HTMLdoc
.body.innerHTML = cResponse
If .getElementsByClassName("agency-header")(0) Is Nothing Then 'If this is empty, then this will cause the first 91 error
MsgBox ("It is empty")
' So, lets try another reconnect?
Dim anotherAttemptB As String
With masterBrowser
.Open "GET", URLLink, False
.send
anotherAttemptB = StrConv(.responseBody, vbUnicode)
End With
anotherAttemptB = Mid$(anotherAttemptB, InStr(1, anotherAttemptB, "<!DOCTYPE "))
.body.innerHTML = anotherAttemptB
closePos = InStr(.getElementsByClassName("agency-header")(0).innerHTML, "/") ' 91 error caused here
Else
closePos = InStr(.getElementsByClassName("agency-header")(0).innerHTML, "/")
End If
titleString = Mid(.getElementsByClassName("agency-header")(0).innerHTML, 1, closePos)
titleString = Replace(titleString, "<H2>", "")
titleString = Replace(titleString, "</", "")
testString = .getElementsByClassName("sol-num")(0).innerHTML
solicitationNumber = Replace(.getElementsByClassName("sol-num")(0).innerHTML, "Solicitation Number: ", "")
testString = .getElementsByClassName("agency-name")(0).innerHTML
agencyName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(0)
agencyName = Replace(agencyName, "Agency: ", "")
officeName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(1)
officeName = Replace(officeName, "Office: ", "")
locationName = Split(.getElementsByClassName("agency-name")(0).innerHTML, "<BR>")(2)
locationName = Replace(locationName, "Location: ", "")
postedDate = .getElementById("dnf_class_values_procurement_notice__posted_date__widget").innerHTML
responseDate = .getElementById("dnf_class_values_procurement_notice__response_deadline__widget").innerHTML
responseDate = Replace(responseDate, " ", " ")
noticeString = .getElementById("dnf_class_values_procurement_notice__procurement_type__widget").innerHTML
classificationCode = .getElementById("dnf_class_values_procurement_notice__classification_code__widget").innerHTML
NAICScode = .getElementById("dnf_class_values_procurement_notice__naics_code__widget").innerHTML
closePos = InStr(NAICScode, " ")
NAICScode = Mid(NAICScode, 1, closePos)
End With
Worksheets("Sheet1").Range("A" & rowNumber).Value = titleString
Worksheets("Sheet1").Range("B" & rowNumber).Value = solicitationNumber
Worksheets("Sheet1").Range("C" & rowNumber).Value = agencyName
Worksheets("Sheet1").Range("D" & rowNumber).Value = officeName
Worksheets("Sheet1").Range("E" & rowNumber).Value = locationName
Worksheets("Sheet1").Range("F" & rowNumber).Value = postedDate
Worksheets("Sheet1").Range("G" & rowNumber).Value = responseDate
Worksheets("Sheet1").Range("H" & rowNumber).Value = noticeString
Worksheets("Sheet1").Range("I" & rowNumber).Value = classificationCode
Worksheets("Sheet1").Range("J" & rowNumber).Value = NAICScode
Worksheets("Sheet1").Range("K" & rowNumber).Value = URLLink
rowNumber = rowNumber + 1
Set HTMLdoc = Nothing
End Sub
Sub test()
Initialization
Dim linkArray(19) As String
Dim xmlhttp As New MSXML2.XMLHTTP60
Dim myUrl As String
Dim i As Integer
Dim sResponse As String
'Set HTML = CreateObject("htmlfile")
Dim HTML As New HTMLDocument
Dim testStrin As String
Worksheets("Sheet1").Range("A1").Value = "Title:"
Worksheets("Sheet1").Range("B1").Value = "Solicitation #:"
Worksheets("Sheet1").Range("C1").Value = "Agency:"
Worksheets("Sheet1").Range("D1").Value = "Office:"
Worksheets("Sheet1").Range("E1").Value = "Location:"
Worksheets("Sheet1").Range("F1").Value = "Posted On:"
Worksheets("Sheet1").Range("G1").Value = "Response Date:"
Worksheets("Sheet1").Range("H1").Value = "Notice Type:"
Worksheets("Sheet1").Range("I1").Value = "Classification Code:"
Worksheets("Sheet1").Range("J1").Value = "NAICS Code:"
Worksheets("Sheet1").Range("K1").Value = "Link:"
myUrl = baseLink & basePageNav & "1"
With masterBrowser
.Open "GET", myUrl, False
.send
sResponse = StrConv(.responseBody, vbUnicode)
End With
' sResponse = StrConv(xmlhttp.responseBody, vbUnicode)
sResponse = Mid$(sResponse, InStr(1, sResponse, "<!DOCTYPE "))
With HTML
.body.innerHTML = sResponse
For i = 0 To 19
Dim linkString As String
Dim openPos As Integer
Dim closePos As Integer
If HTML.getElementById(baseRow & CStr(i)) Is Nothing Then ' To make matters even more confusing, after the first 91 error, here is the second until you restart excel
MsgBox ("It is also empty")
Dim anotherAttempt As String
With masterBrowser
.Open "GET", myUrl, False
.send
anotherAttempt = StrConv(.responseBody, vbUnicode)
End With
anotherAttempt = Mid$(anotherAttempt, InStr(1, anotherAttempt, "<!DOCTYPE "))
.body.innerHTML = anotherAttempt
linkString = .getElementById(baseRow & CStr(i)).Children(0).innerHTML
Else
linkString = .getElementById(baseRow & CStr(i)).Children(0).innerHTML
End If
openPos = InStr(linkString, "?")
closePos = InStr(linkString, "w") + 3
linkString = Mid(linkString, openPos, closePos - openPos)
linkString = Replace(linkString, "amp;", "")
linkString = baseLink & linkString
linkArray(i) = linkString
Next i
End With
For i = 0 To 19
extractData (linkArray(i))
Next i
Set HTML = Nothing
MsgBox ("Finished")
End Sub
根据QHarr的要求,第一次出现91错误时的响应文本如下:
"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1
以及第二个91错误发生时的响应文本:
"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1
91错误之前的响应文本:
"<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=8" />
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1