我有一个问题,我没有取得任何进展。
我正在研究我的硕士论文。为此,我有一个Actors列表,需要检查哪些有自己的(德国)维基百科页面。 (约20,000名演员) 由于我在vba编程方面不是很有经验,所以我在论坛中寻找解决方案。 我找到了一个代码,您可以通过谷歌搜索网址,并将第一个结果复制到Excel中。
Using VBA in Excel to Google Search in IE and return the hyperlink of the first result
我试图通过谷歌搜索德语页面来限制搜索到德国维基百科。例如。 “site:de.wikipedia.org intitle:johnny depp”
对于已知的actor来说这很好,但是当我搜索没有自己的页面的actor时,我得到了一个错误代码。 “错误91:对象变量或未设置块变量”
那么你是否可以帮助我在代码中构建一个解决方法,当他/她没有自己的页面时跳过演员并且插入列表中的下一个继续插入?
对于noobie问题感到抱歉,但那会很棒! :) 或者你甚至可以采用更简单的解决方案。
非常感谢你!
Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date
lastRow = Range("A" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.de/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid("rso")
Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
Set link = objH3.getelementsbytagname("a")(0)
str_text = Replace(link.innerHTML, "<EM>", "")
str_text = Replace(str_text, "</EM>", "")
Cells(i, 2) = str_text
Cells(i, 3) = link.href
DoEvents
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
答案 0 :(得分:0)
检查是否找到objResultDiv
元素,如果找到,继续进一步向单元格写入“Not Found”。
您可以尝试这样的事情......
Sub XMLHTTP()
Dim url As String, lastRow As Long
Dim XMLHTTP As Object, html As Object, objResultDiv As Object, objH3 As Object, link As Object
Dim start_time As Date
Dim end_time As Date
Dim i As Long
Dim str_text As String
lastRow = Range("A" & Rows.Count).End(xlUp).Row
Dim cookie As String
Dim result_cookie As String
start_time = Time
Debug.Print "start_time:" & start_time
For i = 2 To lastRow
url = "https://www.google.de/search?q=" & Cells(i, 1) & "&rnd=" & WorksheetFunction.RandBetween(1, 10000)
Set XMLHTTP = CreateObject("MSXML2.serverXMLHTTP")
XMLHTTP.Open "GET", url, False
XMLHTTP.setRequestHeader "Content-Type", "text/xml"
XMLHTTP.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
XMLHTTP.send
If XMLHTTP.Status = 200 Then
Set html = CreateObject("htmlfile")
html.body.innerHTML = XMLHTTP.ResponseText
Set objResultDiv = html.getelementbyid("rso")
If Not objResultDiv Is Nothing Then
Set objH3 = objResultDiv.getelementsbytagname("H3")(0)
Set link = objH3.getelementsbytagname("a")(0)
str_text = Replace(link.innerHTML, "<EM>", "")
str_text = Replace(str_text, "</EM>", "")
Cells(i, 2) = str_text
Cells(i, 3) = link.href
DoEvents
Else
Cells(i, 2) = "Not Found"
Cells(i, 3) = "Not Found"
End If
Else
Cells(i, 2) = "Not Found"
Cells(i, 3) = "Not Found"
End If
Next
end_time = Time
Debug.Print "end_time:" & end_time
Debug.Print "done" & "Time taken : " & DateDiff("n", start_time, end_time)
MsgBox "done" & "Time taken : " & DateDiff("n", start_time, end_time)
End Sub
答案 1 :(得分:0)
有时使用xmlhttp
,serverxmlhttp
或winhttp
请求从Google抓取信息很困难。即使您尝试使用proxy
,Google也可以轻松地将您检测为机器人,因此它会引导您进入captcha
页面,您的尝试将会失败。但是,在这种情况下更安全的方法是试用IE。请尝试以下方式。如果你有IE9,那么刮刀中定义的.querySelector()
将摇摆不定。
Sub ScrapeGoogle()
Dim IE As New InternetExplorer, HTML As HTMLDocument
Dim cel As Range, URL$, post As Object
For Each cel In Range("A2:A" & Cells(Rows.Count, 1).End(xlUp).row)
URL = "https://www.google.de/search?q=" & Replace(cel.Value, " ", "%20")
With IE
.Visible = True
.navigate URL
While .Busy = True Or .readyState <> 4: DoEvents: Wend
Set HTML = .document
If Not HTML.querySelector(".rc h3.r a") Is Nothing Then
Set post = HTML.querySelector(".rc h3.r a")
cel(1, 2) = post.innerText
cel(1, 3) = post.getAttribute("href")
Else
cel(1, 2) = "Nothing found"
cel(1, 3) = "Sorry dear"
End If
End With
Next cel
IE.Quit
End Sub
参考添加到库:
Microsoft Internet Controls
Microsoft HTML Object Library