使用标签导航html表以提取内部文本(vbs)

时间:2015-02-27 07:24:46

标签: regex web-scraping vbscript xmlhttprequest wsh

我试图获取的数据示例(查看来源):https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=8B7B2B88CE03567735560917596FA6BD

源代码如下:

<table width="100%" border="0" cellspacing="0" cellpadding="2" bgcolor="#e9edf2">
<tr>
    <td width="2%" height="20"><font size="1">&nbsp;</font></td>
    <td width="30%" valign="top"><font face="verdana" size="-1">Name:</font></td>
    <td width="48%">
        <font face="verdana" size="-1"><b>ABBOTT, HUGH ALLAN&nbsp;<small>(Primary Name)</small></b></font></td>

</tr>

我无法弄清楚如何故意导航到源的这一部分。我需要以某种方式告诉它用TD搜索所有标签&#34;名称:&#34;如果它存在,请给我标签的下一个内容

<b> 

在这种情况下是ABBOTT,HUGH ALLAN。我需要这种类型的方法,因为当项目的位置发生变化时,使用项目(#)查找特定文本并不可靠。我尝试过几种不同的方法,但到目前为止还没有成功。例如&#34;对于每个td ....&#34;方法类型。我最终可以找到正确的项目,但它在多个记录中不可靠。

TIA

编辑 - 这是我关闭的代码:

这假设您有一个包含此路径/名称的文本文件(尽管它表示路径中的电子邮件抓取,而不是在这种情况下尝试收到电子邮件):&#34; C:\ Emailgrab \ myfloridalicense.com \ Extract URL \ AgentURLsRaw_Clean.txt&#34;有这些链接:

https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=2BEA648A94BA20C0C989E9E0071103AF https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=AB8F78E2835A25C2D443B09DE9CDD16F https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=A6DBB6CDEE69A637B4497807A1FE45A6 https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=8B7B2B88CE03567735560917596FA6BD https://www.myfloridalicense.com/LicenseDetail.asp?SID=&id=27A84B8EF8F96AD4F09AF94774456A39

还假设您在此路径/名称中有此头文件:&#34; C:\ Emailgrab \ myfloridalicense.com \ Extract URL \ Complete.csv&#34;使用这些标题:

姓氏,名/中名,地址,许可证号,许可证状态(a),许可证状态(b),许可证到期,URL

VBS代码:

Dim URLFile 
Dim fName
set ie = createobject("internetexplorer.application")
IE.Visible = True
Set objShell = CreateObject("WScript.Shell")
Set WshShell = WScript.CreateObject("WScript.Shell") 
set fso = createobject("scripting.filesystemobject")
Set URLFile = fso.OpenTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt")

do while not URLFile.AtEndOfStream 
fName = URLFile.ReadLine()

ie.navigate fName
do until ie.readystate = 4 : wscript.sleep 10: loop 

For Each elm In IE.Document.getElementsByTagName("table")
If elm.getElementsByTagName("TABLE").Length = 16 THEN

name = elm.document.getElementsByTagName("b").item(3).innertext
address = elm.document.getElementsByTagName("b").item(5).innertext
licensenumber = elm.document.getElementsByTagName("b").item(12).innertext
licensestatus = elm.document.getElementsByTagName("b").item(13).innertext
licenseexp = elm.document.getElementsByTagName("b").item(15).innertext

myData =  name & ", " & replace(address, vbCrLf, "") & ", " & licensenumber & ", " & licensestatus & ", " & licenseexp & "," & fName & ", " & vbCrlf

set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv",8,true) 
ts.write myData 
ts.close 

end if
next

loop

Wscript.Echo "All Data Copied!"

我接近我的方法,但我不太明白为什么它适用于某些链接而不适用于其他链接。您可以在CSV中看到数据已关闭,在某些情况下,从许可证编号开始,其中会关闭其余列。此外,在某些情况下,数据中有逗号,因此我添加了一个额外的LicenseStatus列以尝试解释此问题。我只需要一个(最好是简单的)比使用Item(#)计数更可靠的方法。希望这可以帮助。谢谢!

2 个答案:

答案 0 :(得分:1)

<强>更新

这是基于HTTP请求和RegExp解析到Dictionary的实现,它将带有URL的txt文件作为输入,并将结果写入csv文件:

arrUrls = Split(ReadTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt", 0), vbCrLf)
sCsv = ""
For Each sUrl in arrUrls
    XmlHttpRequest "GET", sUrl, "", "", "", sRespText
    HtmlSimplify sRespText
    ParseToDict "<tr><td></td><td>([^<]*?)</td><td>([^<]*?)(?:</td>){0,1}</tr>", sRespText, oResult
    sCsv = sCsv & """" & oResult("Name:") & """" & ","
    sCsv = sCsv & """" & oResult("Main Address:") & """" & ","
    sCsv = sCsv & """" & oResult("License Number:") & """" & ","
    sCsv = sCsv & """" & oResult("Status:") & """" & ","
    sCsv = sCsv & """" & oResult("Expires:") & """" & ","
    sCsv = sCsv & """" & sUrl & """" & vbCrLf
Next
WriteTextFile sCsv, "C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv", 0
WScript.Echo "All Data Copied!"

Function ReadTextFile(sPath, iFormat)
    With CreateObject("Scripting.FileSystemObject").OpenTextFile(sPath, 1, False, iFormat)
        ReadTextFile = ""
        If Not .AtEndOfStream Then ReadTextFile = .ReadAll
        .Close
    End With
End Function

Sub WriteTextFile(sCont, sPath, iFormat)
    With CreateObject("Scripting.FileSystemObject").OpenTextFile(sPath, 2, True, iFormat)
        .Write(sCont)
        .Close
    End With
End Sub

Sub HtmlSimplify(sCont)
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = "(<[\w\/^<]*)[\s\S]*?>"
        sCont = .Replace(sCont, "$1>")
        .Pattern = "(?:<font>|</font>|<b>|</b>|<small>|</small>|<br>)"
        sCont = .Replace(sCont, "")
        .Pattern = "&nbsp;"
        sCont = .Replace(sCont, " ")
        .Pattern = "[\f\n\r\t\v]"
        sCont = .Replace(sCont, "")
        .Pattern = " +"
        sCont = .Replace(sCont, " ")
        .Pattern = "> <"
        sCont = .Replace(sCont, "><")
    End With
End Sub

Sub XmlHttpRequest(sMethod, sUrl, arrSetHeaders, sFormData, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("Msxml2.ServerXMLHTTP.3.0")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open sMethod, sUrl, False
        If IsArray(arrSetHeaders) Then
            For Each arrHeader In arrSetHeaders
                .SetRequestHeader arrHeader(0), arrHeader(1)
            Next
        End If
        .Send sFormData
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Sub ParseToDict(sPattern, sResponse, oDict)
    Dim oMatch, arrSMatches, sSubMatch
    Set oDict = CreateObject("Scripting.Dictionary")
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If Trim(oMatch.SubMatches(0)) <> "" Then oDict(oMatch.SubMatches(0)) = oMatch.SubMatches(1)
        Next
    End With
End Sub

每个网页解析数据都被放入字典中,项目名称作为关键字。 csv的内容是通过引用值的名称而不是相对位置来创建的,因此所有列都位于列出的任何URL的位置。
另外,我添加了双引号,以避免使用逗号Current,Inactive(RFC 4180 point 2.6)拆分为单独的列值。
它不是任何网站的通用解决方案。对于另一个网站,您必须创建适当的RegExp模式进行解析。在这种情况下,我在HtmlSimplify之后将HTML内容保存到文件中,并检查它以找出理想的模式。更多的是,要将这种方法用于更复杂的站点,您可能需要通过切断多余的HTML部分来解析几个步骤(可能是循环的),从而缩小搜索范围。

<强> SOURSE

考虑这个VBS解析器:

' sCont contains table HTML at the moment

With CreateObject("VBScript.RegExp")
    .Global = True
    .MultiLine = True
    .IgnoreCase = True
    ' content simplification
    .Pattern = "(<[\w\/^<]*)[\s\S]*?>"
    sCont = .Replace(sCont, "$1>")
    .Pattern = "(?:<font>|</font>|<b>|</b>|<small>|</small>|<br>)"
    sCont = .Replace(sCont, "")
    .Pattern = "&nbsp;"
    sCont = .Replace(sCont, " ")
    .Pattern = "[\f\n\r\t\v]"
    sCont = .Replace(sCont, "")
    .Pattern = " +"
    sCont = .Replace(sCont, " ")
    .Pattern = "> <"
    sCont = .Replace(sCont, "><")
End With
ParseToArray "<tr><td></td><td>([^<]*?)</td><td>([^<]*?)(?:</td>){0,1}</tr>", sCont, arrResult

' continue processing of arrResult
WScript.Echo arrResult(0)(1) ' eg shows name
' ...

Sub ParseToArray(sPattern, sResponse, arrMatches)
    Dim oMatch, arrSMatches, sSubMatch
    arrMatches = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .IgnoreCase = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            arrSMatches = Array()
            For Each sSubMatch in oMatch.SubMatches
                PushItem arrSMatches, sSubMatch
            Next
            PushItem arrMatches, arrSMatches
        Next
    End With
End Sub

Sub PushItem(arrList, varItem)
    ReDim Preserve arrList(UBound(arrList) + 1)
    arrList(UBound(arrList)) = varItem
End Sub

sCont开始,如下所示:

<table cellspacing="0" cellpadding="1" width="100%" border="0" bgcolor="#b6c9dc"><tr><td>
    <table width="100%" border="0" cellspacing="0" cellpadding="3"><tr>
        <td width="32%"><font face="verdana" size="-1"><b>Licensee Information</b></font></td>
    </tr></table>
    <table width="100%" border="0" cellspacing="0" cellpadding="2" bgcolor="#e9edf2">
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">Name:</font></td>
        <td width="48%">
            <font face="verdana" size="-1"><b>ABBOTT, HUGH ALLAN&nbsp;<small>(Primary Name)</small></b></font></td>

    </tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1"></font></td>
        <td width="48%"><font face="verdana" size="-1"><b>&nbsp;<small>(DBA Name)</small></b></font></td>
    </tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">Main Address:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>318 TURKEY CREEK 
        <br>ALACHUA&nbsp;&nbsp;Florida&nbsp;&nbsp;32615</b></font>
        </td>
    </tr>

    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">County:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>ALACHUA
    </tr>



    <tr><td>&nbsp;</td></tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">License Mailing:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>318 TURKEY CREEK

        <br>ALACHUA&nbsp;&nbsp;FL &nbsp;&nbsp;32615</b></font>

    </tr>

    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">County:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b>ALACHUA
    </tr>

    <tr><td>&nbsp;</td></tr>
    <tr>
        <td width="2%" height="20"><font size="1">&nbsp;</font></td>
        <td width="30%" valign="top"><font face="verdana" size="-1">LicenseLocation:</font></td>
        <td width="68%"><font face="verdana" size="-1"><b> 

        <br>&nbsp;&nbsp; &nbsp;&nbsp; </b></font>

    </tr>

    </table>

</td></tr></table>

它获得了本地调试器截图所示的子数组数组:

Locals array screenshot

答案 1 :(得分:0)

我无法完全实现您的方法,但您的响应的一些元素DID引导我找到一个有效的解决方案。如果“许可证号”列包含许可证号以外的其他内容,我使用IF语句的组合将每个项目移动到必要数量的列上。还使用您的建议让替换功能处理逗号问题。这就是我的问题的一部分就是把事情搞砸了。

Dim URLFile 
Dim fName
set ie = createobject("internetexplorer.application")
IE.Visible = True
Set objShell = CreateObject("WScript.Shell")
Set WshShell = WScript.CreateObject("WScript.Shell") 
set fso = createobject("scripting.filesystemobject")
Set URLFile = fso.OpenTextFile("C:\Emailgrab\myfloridalicense.com\Extract URL\AgentURLsRaw_Clean.txt")

do while not URLFile.AtEndOfStream 
fName = URLFile.ReadLine()

ie.navigate fName
do until ie.readystate = 4 : wscript.sleep 10: loop 

For Each elm In IE.Document.getElementsByTagName("table")
If elm.getElementsByTagName("TABLE").Length = 16 THEN

name = elm.document.getElementsByTagName("b").item(3).innertext
address = replace(elm.document.getElementsByTagName("b").item(5).innertext,","," ")

'License Number
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licensenumber = elm.document.getElementsByTagName("b").item(14).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licensenumber = elm.document.getElementsByTagName("b").item(13).innertext
Else licensenumber = elm.document.getElementsByTagName("b").item(12).innertext
End If

'License Status
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licensestatus = elm.document.getElementsByTagName("b").item(15).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licensestatus = elm.document.getElementsByTagName("b").item(14).innertext
Else licensestatus = elm.document.getElementsByTagName("b").item(13).innertext
End If

'License Exp
If InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Real Estate Broker or Sales") THEN 
licenseexp = elm.document.getElementsByTagName("b").item(17).innertext 
ELSEIf InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Broker") or InStr(elm.document.getElementsByTagName("b").item(12).innertext, "Sales Associate") THEN
licenseexp = elm.document.getElementsByTagName("b").item(16).innertext
Else licenseexp = elm.document.getElementsByTagName("b").item(15).innertext
End If

myData =  name & ", " & replace(address, vbCrLf, "") & ", " & replace(licensenumber, ","," ") & ", " & replace(licensestatus, ","," ") & ", " & replace(licenseexp,","," ") & "," & fName & ", " & vbCrlf

set fso = createobject("scripting.filesystemobject")
set ts = fso.opentextfile("C:\Emailgrab\myfloridalicense.com\Extract URL\Complete.csv",8,true) 
ts.write myData 
ts.close 

end if
next

loop

Wscript.Echo "All Data Copied!"