无法使用 URL-copy&paste 打开此 URL,它仅显示错误。我曾尝试通过 MSXML2.XMLHTTP 访问,但没有成功,结果一样!
这是我的代码:
Sub GetDataWebsite()
Const URL = "http://Zvg-port.de/index.php"
Dim HTML As New HTMLDocument
Dim elmt As Object
Dim x As Long
With CreateObject("MSXML2.XMLHTTP")
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send
HTML.body.innerHTML = .responseText
End With
Set elmt = HTML.querySelectorAll("tr") 'or any class or tag or name
For x = 0 To elmt.Length - 1
ActiveSheet.Cells(x + 2, 2) = elmt.Item(x).innerText
Next x
结束子
什么都没发生!可能是什么问题呢?谢谢!
答案 0 :(得分:1)
对于您的第一个问题,您需要在初始请求正文中添加其他参数。
奇怪的是,如果要使用 html
,就必须继续使用 querySelectorAll()
,而不是将任何内容设置为派生变量。
对于您的第二个问题,预计在来自搜索页面后会导航到结果网址。一些测试表明需要一个 referer
标头。我知道一个带有/不带 html 会话、带有 referer
标头的请求会起作用,因为我用 Python 进行了测试,但我还没有弄清楚 VBA 缺少什么;我目前的尝试返回了看起来也被截断的奇怪编码。
目前,我看到的最简单的方法,如果坚持使用 VBA,要确保遵循链接,将自动化浏览器,收集结果和结果链接,然后导航到每个链接。< /p>
当前代码(回答您的第一个问题):
Option Explicit
Public Sub GetDataZvgPort()
Const URL = "https://www.zvg-portal.de/index.php?button=Suchen"
Dim html As MSHTML.HTMLDocument, xhr As Object
Set html = New MSHTML.HTMLDocument
Set xhr = CreateObject("MSXML2.ServerXMLHTTP.6.0")
Dim headers As Variant
With xhr
.Open "POST", URL, False
.setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
.send "land_abk=sh&ger_name=Norderstedt&order_by=2&ger_id=X1526"
headers = .getAllResponseHeaders
html.body.innerHTML = .responseText
End With
Dim x As Long, link As String, gatheredLinks()
ReDim gatheredLinks(html.querySelectorAll("td a").Length - 1)
For x = 0 To html.querySelectorAll("table a nobr").Length - 1
ActiveSheet.Cells(x + 2, 2) = html.querySelectorAll("table a nobr").Item(x).innerText
link = Replace$(html.querySelectorAll("td a").Item(x).href, "about:", "https://www.zvg-portal.de/")
ActiveSheet.Cells(x + 2, 3) = link
Dim j As Long
For j = 0 To html.querySelectorAll("tr").Length - 1
If InStr(html.querySelectorAll("tr").Item(j).innerHTML, "Amtsgericht") > 0 Then
ActiveSheet.Cells(x + 2, 4) = html.querySelectorAll("tr").Item(j).getElementsByTagName("b")(0).innerText
Exit For
End If
Next
gatheredLinks(x) = link
Next x
' With xhr
' For x = LBound(gatheredLinks) To UBound(gatheredLinks)
' .Open "GET", gatheredLinks(x), False
' .setRequestHeader "Referer", "https://www.zvg-portal.de/index.php?button=Suchen"
' .setRequestHeader "Content-Type", "text/html; charset=ISO-8859-1"
' .setRequestHeader "User-Agent", "python-requests/2.24.0"
' .setRequestHeader "Accept-Encoding", "gzip, deflate"
' .setRequestHeader "Connection", "keep-alive"
' .setRequestHeader "Accept", "text/html,application/xhtml+xml,application/xml;"
' .send
' ActiveSheet.Cells(x + 2, 5) = .Status
' html.body.innerHTML = .responseText 'test response
' Dim s As String
' s = .responseText
' ActiveSheet.Cells(x + 2, 6) = s
' Stop
'
' 'do something else
' Next
' End With
Stop
End Sub
Python(使用会话)可以成功地从结果链接中检索内容:
import requests
from bs4 import BeautifulSoup as bs
data = {'ger_name': 'Norderstedt','order_by': '2','land_abk': 'sh','ger_id': 'X1526'}
headers = {'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}
with requests.Session() as s:
r = s.post('https://www.zvg-portal.de/index.php?button=Suchen', data=data)
print(r.status_code)
soup = bs(r.content, 'lxml')
links = ['https://www.zvg-portal.de/' + i['href'] for i in soup.select('td a')]
s.headers = headers
for link in links:
r = s.get(link)
# print(r.status_code)
soup = bs(r.content, 'lxml')
print(soup.select_one('td p').text)
会话不需要。它只是为了提高效率。
没有仍然有效的会话,发送的标头是:
{'User-Agent': 'python-requests/2.24.0', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive', 'Referer': 'https://www.zvg-portal.de/index.php?button=Suchen'}