从网站抓取表格

时间:2018-08-20 19:24:29

标签: html excel vba web-scraping

我有以下代码导航到网站,输入两个名称(例如,此处使用的真实姓名将从电子表格中提取10个名称的列表),然后搜索其记录。我正在尝试将生成的结果表提取到电子表格中。我已经尝试了几种方法,但似乎无法使其正常工作。在注释“ Scrape Table Here”下查找代码。我知道这涉及到访问网站的HTML,我也可以这样做,但是我对HTML不够熟悉,无法独自解决这一问题。奖励问题:我还想将每个人的ID#添加到电子表格中。在HTML中,它在“ MP_Details?”之后列出。例如,对于“罗伯特·琼斯”,我想要的是“ 36481”。基本上,屏幕快照中所有以红色突出显示的内容,我都想从表中拉出并在电子表格中吐出:

Sub Input_And_Return()

'Create new instance of Internet Explorer
Dim ieApp As Object: Set ieApp = New InternetExplorer
Dim ieDoc As Object
Dim html As HTMLDocument

ieApp.Visible = True
ieApp.navigate "https://hdmaster.net/MP/MP_Public"
Do While ieApp.Busy: DoEvents: Loop
Do Until ieApp.readyState = READYSTATE_COMPLETE: DoEvents: Loop

Set ieDoc = ieApp.document
Set html = ieApp.document

'Enter names into search box and click search
With ieDoc.forms(0)
    .SearchFor.Value = "Anderson, Kelly" & Chr(10) & "Jones, Robert"
    .submit
End With

'Scrape Table Here

'Close down IE and reset status bar
Set ieApp = Nothing
Application.StatusBar = ""

End Sub

HTML Screenshot

1 个答案:

答案 0 :(得分:0)

您可以将表格externalHTML复制到剪贴板并将其粘贴到Excel。很好,简单,快捷。

Option Explicit
Public Sub GetInfo()
    Dim IE As New InternetExplorer
    Dim nameList As String
    nameList = "Anderson, Kelly" & Chr$(10) & "Jones, Robert"
    With IE
        .Visible = True
        .navigate "https://hdmaster.net/MP/MP_Public"

        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document
            .querySelector("[name=SearchFor]").Value = nameList
            .querySelector("#search").Click
        End With

        While .Busy Or .readyState < 4: DoEvents: Wend

        Dim clipboard As Object
        Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
        clipboard.SetText .document.querySelector(".newTable").outerHTML
        clipboard.PutInClipboard
        ThisWorkbook.Worksheets("Sheet1").Cells(1, 1).PasteSpecial
        .Quit
    End With
End Sub

参考(VBE>工具>参考):

  1. Microsoft HTML对象库
  2. Microsoft Internet控件

您的上述代码版本:

Public Sub Input_And_Return()
    Dim ieApp As Object: Set ieApp = New InternetExplorer
    Dim ieDoc As Object

    With ieApp
        .Visible = True
        .navigate "https://hdmaster.net/MP/MP_Public"
        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document.forms(0)
            .SearchFor.Value = "Anderson, Kelly" & Chr$(10) & "Jones, Robert"
            .submit
            Dim clipboard As Object
            Set clipboard = GetObject("New:{1C3B4210-F441-11CE-B9EA-00AA006B1A69}")
            clipboard.SetText .getElementsByClassName("newTable")(0).outerHTML
            clipboard.PutInClipboard
        End With
        ThisWorkbook.Worksheets("Sheet1").Cells(1, 1).PasteSpecial
        .Quit
    End With
End Sub

或通过循环表的行和列:

Public Sub Input_And_Return()
    Dim ieApp As Object, ieDoc As Object
    Set ieApp = New InternetExplorer
    With ieApp
        .Visible = True
        .navigate "https://hdmaster.net/MP/MP_Public"
        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document.forms(0)
            .SearchFor.Value = "Anderson, Kelly" & Chr$(10) & "Jones, Robert"
            .submit
            Dim r As Long, c As Long, tr As Object, td As Object
            With .getElementsByClassName("newTable")(0)
                For Each tr In .getElementsByTagName("tr")
                    r = r + 1: c = 1
                    For Each td In tr.getElementsByTagName("td")
                        Cells(r, c).Value = td.innerText
                        c = c + 1
                    Next td
                Next tr
            End With
        End With
        .Quit
    End With
End Sub

输出:

output


编辑:

一些丑陋的代码来获取短ID

Option Explicit
Public Sub Input_And_Return()
    Dim ieApp As Object, ieDoc As Object
    Set ieApp = New InternetExplorer
    With ieApp
        .Visible = True
        .navigate "https://hdmaster.net/MP/MP_Public"
        While .Busy Or .readyState < 4: DoEvents: Wend

        With .document.forms(0)
            .SearchFor.Value = "Anderson, Kelly" & Chr$(10) & "Jones, Robert"
            .submit
            Dim r As Long, c As Long, tr As Object, td As Object, hTable As Object, aNodeList As Object
            Set hTable = .getElementsByClassName("newTable")(0)
            Set aNodeList = .getElementsByClassName("newTable")(0).querySelectorAll("[align=center][onclick*='javascript:rowClick']")

            Dim idDict As Object, i As Long, tempVal As Long
            Set idDict = CreateObject("Scripting.Dictionary")

            For i = 0 To aNodeList.Length - 1
                tempVal = Split(Split(aNodeList.Item(i).onclick, "id=")(1), Chr$(39))(0)
                If Not idDict.exists(tempVal) Then idDict.Add tempVal, vbNullString
            Next i

            With hTable
                For Each tr In .getElementsByTagName("tr")
                    r = r + 1: c = 1
                    For Each td In tr.getElementsByTagName("td")
                        Cells(r, c).Value = td.innerText
                        c = c + 1
                    Next td
                Next tr
               If idDict.Count = r - 1 Then Cells(2, c).Resize(idDict.Count, 1) = Application.WorksheetFunction.Transpose(idDict.keys)
            End With
        End With
        .Quit
    End With
End Sub