<article or =“” <span =“” in =“” source =“”

时间:2018-10-23 19:41:36

标签: html vba excel-vba web-scraping

=“”

如果不在一个框架?我正在努力从该站点https://ai.fmcsa.dot.gov/SMS/Carrier/621247/CarrierRegistration.aspx收集一些详细信息,而我无法在其中提取数据。

我正在尝试从<span class="dat">中提取<div id="regBox">项中的文本,尽管没有检索。想要法定名称,地址,行驶里程和电子邮件。整个“车辆类型”细分也列在1个单元格的单独单元格中。

可以做到吗?

Sub ScrapeFMSCA(DOTNum)


Dim ie As Object
Dim ieDoc As Object
Dim ieEle As Object

Dim k As Long
Dim s As Object
Dim P As String
Dim txt As String
Dim rng As Range, cname As String
Dim r As Integer, c As Integer
Dim elemCollection As Object, curHTMLRow As Object

Application.ScreenUpdating = True

Set ie = CreateObject("InternetExplorer.Application")

k = 2

With ie
    .Visible = True

    URL = "https://ai.fmcsa.dot.gov/SMS/Carrier/" & DOTNum & "/CarrierRegistration.aspx"
    Application.StatusBar = " Logging In "
    .Navigate URL

    Do While ie.Busy: DoEvents: Loop '** Wait til page loaded
    Do While ie.ReadyState <> 4: DoEvents: Loop '** Wait til IE READY

    Set ieDoc = ie.Document
    Set NodeList = ieDoc.getElementsByTagName("article").getElementsByTagName("span").getElementsByClassName("dat")(1)
    MsgBox NodeList.span

    cTime = Now + TimeValue("00:01:00")
    Do Until (ie.ReadyState = 4 And Not ie.Busy)
        If Now < cTime Then
            DoEvents
        Else
            GoTo Here1
        End If
    Loop

Here1:

    ie.Quit

End With

End Sub

3 个答案:

答案 0 :(得分:1)

我将如下所示进行操作。在编写此解决方案时,您会考虑将来在多个DOTNum上循环。我已经对3个数字进行了循环测试,效果很好。

访问方式:

我使用XMLHTTP请求作为比启动IE更快的方法。

注册信息:

我通过将CSS类选择器应用于类为.dat的目标元素而获得的注册信息。然后,我索引由nodeList返回的querySelectorAll,以检索所需的元素。

车辆类型明细:

我最初按索引和标签.getElementsByTagName("table")(0)抓取的完整车辆类型明细表。

该表的布局有些棘手。例如,第一列元素实际上是th而不是td标记的。通过首先使用thead th的CSS后代选择器组合隔离实际的标头,可以解决此问题。这仅针对表头中的th元素。然后,我在CSS后代选择器组合中使用CSS OR运算符来拉回第一列th元素或其余列td标记元素:tbody tr th,td。 我使用mod 4来确定它是否是第一列,并相应地调整写出到新行。

助手(最佳做法是模块化代码):

我已经使用了GetLastRow辅助函数来确定从哪里开始写,因为您似乎最终将在一个循环中将其部署在不同的DOTnum上。我使用一个类来保存XMLHTTP对象。

WriteTable在is上说。它写出表格。

点数:

我从名为DOTNumbers的工作表中读取了dotNums。在我的示例中,我使用3个数字来获取{529136,621247,2474795}的信息。数组dotNums会被表单中的这些值填充,并被循环以提供要添加到网址中的dotNum。


CSS查询中的OR运算符示例(示例):


示例输出:


VBA:

类模块clsHTTP:

Option Explicit    
Private http As Object

Private Sub Class_Initialize()
    Set http = CreateObject("MSXML2.XMLHTTP")
End Sub

Public Function GetString(ByVal url As String) As String
    Dim sResponse As String
    With http
        .Open "GET", url, False
        .setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
        .send
        sResponse = StrConv(.responseBody, vbUnicode)
        GetString = sResponse
    End With
End Function

标准模块1:

Option Explicit    
Public Sub GetInfo()
    Dim html As HTMLDocument, headers1(), hTable As HTMLTable
    Dim ws As Worksheet, wsDotNums As Worksheet, registrationinfo As Object, nextRow As Long
    Dim dotNums(), http As clsHTTP, url As String, i As Long

    Application.ScreenUpdating = True

    Set http = New clsHTTP
    Set ws = ThisWorkbook.Worksheets("Sheet1")
    Set wsDotNums = ThisWorkbook.Worksheets("DOTNumbers")
    Set html = New HTMLDocument

    dotNums = Application.Transpose(wsDotNums.Range("A1:A3").Value) '<== Change the range here to the single column range containing your dotNums.

    For i = LBound(dotNums) To UBound(dotNums)
        If Not IsEmpty(dotNums(i)) Then
            With html
                url = "https://ai.fmcsa.dot.gov/SMS/Carrier/" & dotNums(i) & "/CarrierRegistration.aspx"
                html.body.innerHTML = http.GetString(url)
                Set registrationinfo = .querySelectorAll(".dat")
                Set hTable = .getElementsByTagName("table")(0)
            End With

            headers1 = Array("Legal Name", "Address", "Miles Traveled ", "Email")

            nextRow = IIf(GetLastRow(ws, 1) = 1, 1, GetLastRow(ws, 1) + 2)

            With ws
                .Cells(nextRow, 1).Resize(1, UBound(headers1) + 1) = headers1
                .Cells(nextRow + 1, 1) = registrationinfo.item(0).innerText
                .Cells(nextRow + 1, 2) = registrationinfo.item(3).innerText
                .Cells(nextRow + 1, 3) = registrationinfo.item(7).innerText
                .Cells(nextRow + 1, 4) = registrationinfo.item(6).innerText
            End With

            WriteTable hTable, nextRow + 3, ws

        End If     
    Next      
    Application.ScreenUpdating = True
End Sub

Public Sub WriteTable(ByVal hTable As HTMLTable, Optional ByVal startRow As Long = 1, Optional ByVal ws As Worksheet)
    If ws Is Nothing Then Set ws = ActiveSheet

    Dim r As Long, c As Long, i As Long, headers As Object
    r = startRow
    With ws
        Set headers = hTable.querySelectorAll("thead th")
        For i = 0 To headers.Length - 1
            .Cells(r, i + 1) = headers.item(i).innerText
        Next

        Dim tableContents As Object
        Set tableContents = hTable.querySelectorAll("tbody tr th,td")

        For i = 0 To tableContents.Length - 1
            If i Mod 4 = 0 Then
                c = 1: r = r + 1
            Else
                c = c + 1
            End If
            .Cells(r, c) = tableContents.item(i).innerText
        Next
    End With
End Sub

Public Function GetLastRow(ByVal ws As Worksheet, Optional ByVal columnNumber As Long = 1) As Long
    With ws
        GetLastRow = .Cells(.Rows.Count, columnNumber).End(xlUp).Row
    End With
End Function

答案 1 :(得分:1)

您可能还会考虑通过以下方式出手。它应该获取您所有上述字段。我试图将其压缩到单个子容器中。但是,如果您有循环要进行脚本,则不要搞乱。我在脚本中使用了简单的选择器以及嵌套循环来完成您的工作。希望这会有所帮助。

Sub FetchData()
    Const link As String = "https://ai.fmcsa.dot.gov/SMS/Carrier/621247/CarrierRegistration.aspx"
    Dim Httpreq As New XMLHTTP60, Htmldoc As New HTMLDocument
    Dim R&, I&, tR&, N&, C&

    With Httpreq
        .Open "GET", link, False
        .send
        Htmldoc.body.innerHTML = .responseText
    End With

    [A1:C1] = [{"Legal Name", "Address", "Miles Traveled"}]

    With Htmldoc.querySelectorAll("#regBox label,#regBox h3")
        For R = 0 To .Length - 1
            If .item(R).innerText Like "*Legal Name*" Then
                I = I + 1: Cells(I + 1, 1) = .item(R).NextSibling.innerText
            End If

            If .item(R).innerText Like "*Address*" Then
                Cells(I + 1, 2) = .item(R).NextSibling.innerText
            End If

            If .item(R).innerText Like "*Vehicle Miles Traveled*" Then
                Cells(I + 1, 3) = .item(R).NextSibling.innerText
            End If

            If .item(R).innerText Like "*Vehicle Type Breakdown*" Then
                With .item(R).NextSibling.Rows
                    For tR = 0 To .Length - 1
                        With .item(tR).Cells
                            For N = 0 To .Length - 1
                                C = C + 1: Cells(I + 2, C) = .item(N).innerText
                            Next N
                        End With
                        I = I + 1: C = 0
                    Next tR
                End With
            End If
        Next R
    End With
End Sub

答案 2 :(得分:0)

运行以下宏,您将获得前三个必填字段:

Sub GetInformation()
    Const Url$ = "https://ai.fmcsa.dot.gov/SMS/Carrier/621247/CarrierRegistration.aspx"
    Dim Http As New xmlhttp60, Html As New HTMLDocument, post As Object, I&

    With Http
        .Open "GET", Url, False
        .send
        Html.body.innerHTML = .responseText
    End With

    For Each post In Html.getElementsByTagName("label")
        If InStr(post.innerText, "Legal Name") > 0 Then
            I = I + 1: Cells(I, 1) = post.NextSibling.innerText
        End If

        If InStr(post.innerText, "Address") > 0 Then
            Cells(I, 2) = post.NextSibling.innerText
        End If

        If InStr(post.innerText, "Vehicle Miles Traveled") > 0 Then
            Cells(I, 3) = post.NextSibling.innerText
        End If
    Next post
End Sub

添加参考:

Microsoft xml, v6.0
Microsoft Html Object Library