从本地html文件导入数据后调整结果

时间:2018-12-11 15:02:35

标签: excel vba excel-vba html-table

在出色的成员@QHarr的帮助下,我有以下代码使我能够从html本地文件中抓取数据,而且很好

Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long

Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "\" Else Exit Sub
sSchool = Split(sFolder, "\")(UBound(Split(sFolder, "\")) - 1)
sFile = Dir(sFolder)

cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

Application.ScreenUpdating = False
    While sFile <> ""
        With fStream
            .Charset = "UTF-8"
            .Open
            .LoadFromFile sFolder & sFile
            html.body.innerHTML = .ReadText
            .Close
        End With

        Set tables = html.querySelectorAll("table[width='100%'] table:first-child")

        For i = 89 To tables.Length - 17 Step 26
            Erase arr
            arr(0) = vbNullString

            For j = 0 To 12
                arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
                If j = 4 And arr(3) = "غير مصرى‏" Then arr(mappings(j)) = 0
            Next j

            For j = UBound(arr) To LBound(arr) Step -1
                newarr(n) = arr(j)
                If n = 6 Then
                    If IsDate(newarr) Then newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
                End If
                n = n + 1
            Next j

            ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
            cnt = cnt + 1: n = 0
        Next i

        sFile = Dir
    Wend

    ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
    ws.Activate
Application.ScreenUpdating = True
End Sub

当没有国籍ID(即HTML表中的第三列)时,结果的唯一问题是الرقــم القومــي 当它为空时,关于名称以及以下名称,我没有得到正确的结果 如果运行代码,请注意从11到17的行。 附件是FolderToTest,在此LINK

上具有文件

我已尝试解决此问题,但结果有所调整(但仍然不正确,因为缺少具有空国籍ID的名称,并且以下名称包含了他的一些数据) 这是我最后的尝试

Sub Test()
Dim html As HTMLDocument, tables As Object, ws As Worksheet, fStream As ADODB.Stream
Dim headers(), mappings(), arr(13), newarr(13), cnt As Long, i As Long, j As Long, n As Long
Dim xFd As FileDialog, sFile As Variant, sSchool As String, sFolder As String, x As Long

Set ws = ThisWorkbook.Worksheets("Results")
Set html = New HTMLDocument
Set fStream = New ADODB.Stream
Set xFd = Application.FileDialog(msoFileDialogFolderPicker)
xFd.Title = "Please Select The Original Folder:"
If xFd.Show = -1 Then sFolder = xFd.SelectedItems(1) & "\" Else Exit Sub
sSchool = Split(sFolder, "\")(UBound(Split(sFolder, "\")) - 1)
sFile = Dir(sFolder)

cnt = ws.Cells(Rows.Count, 1).End(xlUp).Row: x = cnt
headers = Array("م", "كود الطالب", "الرقم القومي", "اسم الطالب", "الجنسية", "الديانة", "تاريخ الميلاد", "يوم", "شهر", "سنة", "محافظة الميلاد", "حالة القيد", "النوع", "ملاحظات")
mappings = Array(3, 8, 9, 12, 11, 10, 2, 7, 1, 6, 5, 4, 13)
If IsEmpty(ws.Cells(1, 1).Value) Then ws.Cells(1, 1).Resize(1, UBound(headers) + 1) = headers

Application.ScreenUpdating = False
While sFile <> ""
    With fStream
        .Charset = "UTF-8"
        .Open
        .LoadFromFile sFolder & sFile
        html.body.innerHTML = .ReadText
        .Close
    End With

    Set tables = html.querySelectorAll("table[width='100%'] table:first-child")

    For i = 89 To tables.Length - 17 Step 26
        Erase arr
        arr(0) = vbNullString

        For j = 0 To 12
            arr(mappings(j)) = Application.Trim(tables.Item(i + (2 * (j))).innerText)
            'If j = 4 And arr(3) = "غير مصرى‏" Then arr(mappings(j)) = 0
            If j = 3 And Not IsNumeric(Application.Trim(tables.Item(i + (2 * (j)) + 2).innerText)) Then
                i = i + 24
            End If
        Next j

        For j = UBound(arr) To LBound(arr) Step -1
            newarr(n) = arr(j)
            If n = 6 Then
                newarr(n) = CDate(Day(newarr(n)) & "/" & Month(newarr(n)) & "/" & Year(newarr(n)))
            End If
            n = n + 1
        Next j

        ws.Cells(cnt + 1, 1).Resize(1, UBound(arr) + 1) = newarr
        cnt = cnt + 1: n = 0
    Next i

    sFile = Dir
Wend

ws.Cells(x + 1, 14).Resize(cnt - x).Value = sSchool
ws.Activate
Application.ScreenUpdating = True
End Sub

1 个答案:

答案 0 :(得分:1)

您可以在以下方面进行优化。我使用Select Case根据الرقــم القومــي‎的值测试محافظة الميلاد‎中的缺失值。如果محافظة الميلاد‎غير مصرى‏,那么我假设以后会有一个空值,并调整用于填充数组的c计数器。我将进行更新以删除一些不必要的硬编码。

请注意,此处的映射也与以前的文件不同。

Option Explicit

Public Sub ParseInfo()
    Dim html As HTMLDocument, tables As Object, ws As Worksheet, i As Long
    Set ws = ThisWorkbook.Worksheets("Results")
    Dim fStream  As ADODB.Stream
    Set html = New HTMLDocument
    Set fStream = New ADODB.Stream
    With fStream
        .Charset = "UTF-8"
        .Open
        .LoadFromFile "C:\Users\User\Desktop\test.html"
        html.body.innerHTML = .ReadText
        .Close
    End With
    Dim r As Long, c As Long, currentItem As Variant, missingValueFlag As Boolean
    Set tables = html.querySelectorAll("table")
    Dim mappings(), arr()
    ReDim arr(12)
    mappings = Array(2, 7, 8, 11, 10, 9, 1, 6, 0, 5, 4, 3, 12)
    r = 1: c = 1
    For i = 91 To 504 Step 2
        currentItem = tables.item(i).innerText
        Select Case c
        Case 1
            If currentItem = "غير مصرى‏" Then  
                missingValueFlag = True
            End If
        Case 5
            If missingValueFlag Then c = c + 1
        End Select
        arr(mappings(c - 1)) = currentItem
        If c = 13 Then
            ws.Cells(r, 1).Resize(1, UBound(arr) + 1) = arr
            c = 1: r = r + 1
            missingValueFlag = False
            ReDim arr(12)
        Else
            c = c + 1
        End If
    Next
End Sub