使用 Selenium 从本地 HTML 文件中抓取包含嵌套表格的表格

时间:2018-12-10 19:55:20

标签: html excel vba selenium web-scraping

在 “Scraping table from local HTML with unicode characters” 这个帖子(Link)中,QHarr 帮助我从本地 HTML 文件中抓取了一个表格。我在此链接(Link)处有一个 HTML 文件。

然后我使用了相同的代码,并对变量 “startTableNumber”、“endTableNumber” 和 “numColumns” 做了一些修改:

' Loads a local HTML file (decoded as UTF-8) into an MSHTML document and
' writes the innerText of every second <table> in the index range
' startTableNumber..endTableNumber to the worksheet, nine cells per row,
' starting at row 2, column 1.
'
' Requires project references to "Microsoft ActiveX Data Objects" (for
' ADODB.Stream) and "Microsoft HTML Object Library" (for HTMLDocument).
Public Sub Test()
    Dim fStream As ADODB.Stream, html As HTMLDocument
    Set html = New HTMLDocument
    Set fStream = New ADODB.Stream
    With fStream
        .Charset = "UTF-8"      ' read the file as UTF-8 text
        .Open
        .LoadFromFile "C:\Users\Future\Desktop\Sample 2.html"
        html.body.innerHTML = .ReadText
        .Close
    End With

    Dim hTables As Object, startTableNumber As Long, i As Long, r As Long, c As Long
    ' Each variable needs its own "As Long": a name declared without a type
    ' in a Dim list is a Variant, which is what endTableNumber used to be.
    Dim counter As Long, endTableNumber As Long, numColumns As Long
    Dim lastTableIndex As Long

    startTableNumber = 91
    endTableNumber = 509
    ' NOTE(review): numColumns is assigned but never read; the row wrap
    ' below is hard-coded (counter = 10). Confirm which value is intended.
    numColumns = 14

    Set hTables = html.getElementsByTagName("table")

    ' The collection is zero-based. Clamp the upper bound so a document with
    ' fewer tables than expected does not fail with "Object variable not set"
    ' when hTables(i) returns Nothing.
    lastTableIndex = hTables.Length - 1
    If endTableNumber > lastTableIndex Then endTableNumber = lastTableIndex

    r = 2: c = 1

    For i = startTableNumber To endTableNumber Step 2
        counter = counter + 1
        ' On the 10th table of a row, wrap to the first column of the next row.
        If counter = 10 Then
            c = 1: r = r + 1: counter = 1
        End If
        ' Unqualified Cells: resolves to the active sheet when this runs
        ' from a standard module.
        Cells(r, c) = hTables(i).innerText
        c = c + 1
    Next

End Sub

但是我想进一步分散表的数据,并且想找到一种灵活的方法,使代码无需手动赋值即可自行确定这些变量。我希望找到使用 Selenium 的解决方案,也希望不要收到负面评价(negative rep)。我已尽力把问题说清楚。此致问候。

1 个答案:

答案 0 :(得分:1)

因此,正如我在评论中所说,您需要研究数据在后续各个 table 标签中的排列方式,并建立一个映射以得到正确的列顺序。下面的代码会把这些表格的内容写出到工作表。正如我也提到过的,这种做法并不稳健;只有这种思路或许可以迁移到其他文档。

在您的情况下,您不必从文件中读取 HTML,而是会使用:

Set tables = driver.FindElementsByCss("table[width='100%'] table:first-child")

然后您可以用 For Each 遍历集合中的 Web 元素,并根据需要调整语法,例如使用 .Text 而不是 .innerText。由于 Selenium 对 webElements 的索引方式不同,可能还需要针对 Selenium 做一些其他调整,但您需要做的所有事情从下面的代码中应该都能看明白。

VBA:

Option Explicit
' Reads a local HTML file (decoded as UTF-8), selects the nested tables that
' hold the record fields, and writes one worksheet row per record to "Sheet2".
'
' Each record is read from 13 tables taken at every second index of the
' selected node list; consecutive records start 26 indexes apart. The
' mappings array gives, for the j-th table of a record, the output column
' (zero-based index into headers) that its text belongs to.
'
' Requires project references to "Microsoft ActiveX Data Objects" (for
' ADODB.Stream) and "Microsoft HTML Object Library" (for HTMLDocument).
Public Sub ParseInfo()
    Dim html As HTMLDocument, tables As Object, ws As Worksheet, i As Long
    Set ws = ThisWorkbook.Worksheets("Sheet2")
    Dim fStream  As ADODB.Stream
    Set html = New HTMLDocument
    Set fStream = New ADODB.Stream
    With fStream
        .Charset = "UTF-8"      ' read the file as UTF-8 text
        .Open
        .LoadFromFile "C:\Users\User\Desktop\test.html"
        html.body.innerHTML = .ReadText
        .Close
    End With

    Set tables = html.querySelectorAll("table[width='100%'] table:first-child")

    ' Typed explicitly: a bare "Dim rowCounter" would be a Variant.
    Dim rowCounter As Long: rowCounter = 2
    Dim mappings(), j As Long, headers(), arr(13)
    Dim lastIndex As Long

    headers = Array("Notes", "Type", "Enrollment status", "Governorate of birth", "Year", "Month", "Day", "Date of Birth", "Religion", _
    "Nationality", "Student Name", "National Number", "Student Code", "M")

    mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
    ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

    ' Column 0 ("Notes") has no source table and is never a mapping target,
    ' so it is set once instead of on every record.
    arr(0) = vbNullString

    ' The node list is zero-based; remember its last valid index so a shorter
    ' document ends the loop instead of raising on a missing item.
    lastIndex = tables.Length - 1

    For i = 89 To 504 Step 26
        ' Skip a trailing partial record: its last table would be out of range.
        If i + 2 * UBound(mappings) > lastIndex Then Exit For

        For j = 0 To UBound(mappings)
            arr(mappings(j)) = tables.item(2 * j + i).innerText
        Next

        ' NOTE(review): the first record is written to row 3 (rowCounter + 1),
        ' leaving row 2 blank below the headers. Kept as in the original;
        ' confirm whether the gap is intended.
        ws.Cells(rowCounter + 1, 1).Resize(1, UBound(arr) + 1) = arr
        rowCounter = rowCounter + 1
    Next
End Sub