
时间:2019-04-27 13:35:49

标签: excel vba web-scraping data-extraction



我想提取已进行的比赛场数和剩余的比赛场数,以及每场比赛的主场进球数和客场进球数。


Option Explicit

Sub GetSoccerStats()

'Downloads the K League 1 stats page from betexplorer.com, locates the first
'element with the classes "table-main leaguestats" and copies the rows labelled
'"Matches played", "Matches remaining", "Home goals" and "Away goals" into
'columns A:C, one table row per worksheet row, starting at row 1.
'Output goes through the unqualified Cells property (the active sheet when this
'runs from a standard module).
'
'Set a reference (VBE > Tools > References) to the following libraries:
'   1) Microsoft XML, v6.0
'   2) Microsoft HTML Object Library

Dim xmlReq As New MSXML2.XMLHTTP60
Dim objDoc As New MSHTML.HTMLDocument
Dim objTable As MSHTML.htmlTable
Dim objTableRow As MSHTML.htmlTableRow
Dim strURL As String
Dim strResp As String
Dim strText As String
Dim rw As Long

strURL = "https://www.betexplorer.com/soccer/south-korea/k-league-1/stats/"

With xmlReq
    .Open "GET", strURL, False
    'The request has to be sent before Status / responseText can be read;
    'Open only initialises it. The call is synchronous (third argument False).
    .send
    If .Status <> 200 Then
        MsgBox "Error " & .Status & ":  " & .statusText
        Exit Sub
    End If
    strResp = .responseText
End With


'Parse the response by loading it into an in-memory HTML document.
objDoc.body.innerHTML = strResp

Set objTable = objDoc.getElementsByClassName("table-main leaguestats")(0)

If Not objTable Is Nothing Then
    rw = 1
    For Each objTableRow In objTable.Rows
        'The first cell holds the statistic's label; only four labels are kept.
        strText = objTableRow.Cells(0).innerText
        Select Case strText
            Case "Matches played", "Matches remaining", "Home goals", "Away goals"
                Cells(rw, "a").Value = objTableRow.Cells(0).innerText
                Cells(rw, "b").Value = objTableRow.Cells(1).innerText
                Cells(rw, "c").Value = objTableRow.Cells(2).innerText
                rw = rw + 1
        End Select
    Next objTableRow
End If

Set xmlReq = Nothing
Set objDoc = Nothing
Set objTable = Nothing
Set objTableRow = Nothing

End Sub



您可以在这里找到文件: https://www.dropbox.com/s/77sol24sty75w5z/Avg%20Goals.xlsm?dl=0


您会注意到,有一列包含单词CURRENT。这表明它应使用“当前URL”列中的URL。如果我将值更改为LAST,我希望它使用Last URL列中的URL。





2 个答案:

答案 0 :(得分:1)


' Replace inputArray with the array returned by GetLinks (the helper defined in this answer).
inputArray = GetLinks(inputArray)


我使用了数组,因为您不会希望反复从工作表中读取内容——那是一项昂贵的操作,会使您的代码变慢。出于相同的原因,如果返回的状态码不是 200(<> 200),则将消息和URL打印到立即窗口,而不是弹出消息框,以免降低代码速度。这样您实际上就有了一份日志,可以在最后查看。


Option Explicit   
Public Sub GetSoccerStats()
    'For every data row of the "AVG GOAL DATA" sheet (row 4 down to the last used
    'row in column B) this reads columns J:L, lets GetLinks pick the URL to use
    '(returned in column 4 of the array), downloads that page and collects the
    'second and third cell of the table rows labelled "Matches played",
    '"Matches remaining", "Home goals" and "Away goals". The values are gathered
    'in an in-memory array (8 columns per sheet row) and written in one go
    'starting at M4.
    '
    'Requests that do not return HTTP 200 are logged to the Immediate window and
    'leave that row's results empty.
    '
    'Required references (VBE > Tools > References):
    '   1) Microsoft XML, v6.0
    '   2) Microsoft HTML Object Library
    Dim xmlReq As New MSXML2.XMLHTTP60, response As String
    Dim objDoc As New MSHTML.HTMLDocument, text As String
    Dim lastRow As Long, dataSheet As Worksheet, inputArray(), i As Long
    Dim objTable As MSHTML.HTMLTable, objTableRow As MSHTML.HTMLTableRow

    Set dataSheet = ThisWorkbook.Worksheets("AVG GOAL DATA")

    With dataSheet
        lastRow = .Cells(.Rows.Count, "B").End(xlUp).Row
    End With

    'Nothing to do when there is no data row; "J4:L3" would otherwise be read
    'as the range J3:L4 and pull in the header row.
    If lastRow < 4 Then Exit Sub

    inputArray = dataSheet.Range("J4:L" & lastRow).Value
    inputArray = GetLinks(inputArray)

    Dim results(), r As Long, c As Long
    ReDim results(1 To UBound(inputArray, 1), 1 To 8)

    With xmlReq

        For i = LBound(inputArray, 1) To UBound(inputArray, 1)
            r = r + 1
            .Open "GET", inputArray(i, 4), False
            'Open only initialises the request; it must be sent before Status
            'or responseText are available.
            .send
            If .Status <> 200 Then
                'Log instead of showing a message box so the loop keeps running.
                Debug.Print inputArray(i, 4), vbTab, "Error " & .Status & ":  " & .statusText
            Else
                response = .responseText
                objDoc.body.innerHTML = response

                Set objTable = objDoc.getElementsByClassName("table-main leaguestats")(0)

                If Not objTable Is Nothing Then
                    c = 1
                    For Each objTableRow In objTable.Rows
                        text = objTableRow.Cells(0).innerText
                        Select Case text
                        Case "Matches played", "Matches remaining", "Home goals", "Away goals"
                            'Each matching label contributes two adjacent result columns.
                            results(r, c) = objTableRow.Cells(1).innerText
                            results(r, c + 1) = objTableRow.Cells(2).innerText
                            c = c + 2
                        End Select
                    Next objTableRow
                End If
            End If
            'Reset so a table found for this URL is not reused for the next one.
            Set objTable = Nothing
        Next i
    End With
    dataSheet.Range("M4").Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

Public Function GetLinks(ByRef inputArray As Variant) As Variant
    'Appends one column to the 1-based, two-dimensional inputArray and fills it,
    'row by row, with the URL to use:
    '   column 1 = "CURRENT"  -> value of column 2
    '   anything else         -> value of column 3
    'The callers in this file pass a three-column array, so the new column is
    'column 4.
    '
    'inputArray is passed ByRef and is resized in place; the same array is also
    'returned as the function result.
    Dim i As Long
    'ReDim Preserve may only change the upper bound of the last dimension,
    'which is exactly what adding a column does.
    ReDim Preserve inputArray(1 To UBound(inputArray, 1), 1 To UBound(inputArray, 2) + 1)

    For i = LBound(inputArray, 1) To UBound(inputArray, 1)
        inputArray(i, 4) = IIf(inputArray(i, 1) = "CURRENT", inputArray(i, 2), inputArray(i, 3))
    Next i
    GetLinks = inputArray
End Function


enter image description here


'VBE > Tools > References:
'1: Microsoft HTML Object library  2: Microsoft Internet Controls
Public Sub GetSoccerStats()
    'Internet Explorer variant: for every data row of the "AVG GOAL DATA" sheet
    '(row 4 down to the last used row in column B) this reads columns C:E, lets
    'GetLinks pick the URL to use (returned in column 4 of the array), navigates
    'to it and collects the second and third cell of the table rows labelled
    '"Matches played", "Matches remaining", "Home goals" and "Away goals".
    'The values are gathered in an in-memory array (8 columns per sheet row) and
    'written in one go starting at F4.
    '
    'After each navigation the stats table is polled for up to MAX_WAIT_SEC
    'seconds; if it never appears that row's results stay empty.
    Dim ie As Object, t As Single
    Dim text As String
    Dim lastRow As Long, dataSheet As Worksheet, inputArray(), i As Long
    Dim objTable As MSHTML.HTMLTable, objTableRow As MSHTML.HTMLTableRow

    Const MAX_WAIT_SEC As Long = 10

    Set dataSheet = ThisWorkbook.Worksheets("AVG GOAL DATA")
    Set ie = CreateObject("InternetExplorer.Application")
    With dataSheet
        lastRow = .Cells(.Rows.Count, "B").End(xlUp).Row
    End With

    inputArray = dataSheet.Range("C4:E" & lastRow).Value
    inputArray = GetLinks(inputArray)

    Dim results(), r As Long, c As Long
    ReDim results(1 To UBound(inputArray, 1), 1 To 8)

    With ie
        .Visible = True
        For i = LBound(inputArray, 1) To UBound(inputArray, 1)
            r = r + 1
            .navigate2 inputArray(i, 4)

            While .Busy Or .readyState < 4: DoEvents: Wend

            'Poll for the table with a timeout: the lookup is retried until it
            'succeeds or MAX_WAIT_SEC seconds have passed.
            t = Timer
            Do
                DoEvents
                'The lookup raises an error while the document is not ready.
                On Error Resume Next
                Set objTable = .document.getElementsByClassName("table-main leaguestats")(0)
                On Error GoTo 0
                If Timer - t > MAX_WAIT_SEC Then Exit Do
            Loop While objTable Is Nothing

            If Not objTable Is Nothing Then
                c = 1
                For Each objTableRow In objTable.Rows
                    text = objTableRow.Cells(0).innerText
                    Select Case text
                    Case "Matches played", "Matches remaining", "Home goals", "Away goals"
                        'Each matching label contributes two adjacent result columns.
                        results(r, c) = objTableRow.Cells(1).innerText
                        results(r, c + 1) = objTableRow.Cells(2).innerText
                        c = c + 2
                    End Select
                Next objTableRow
            End If
            'Reset so a table found for this URL is not reused for the next one.
            Set objTable = Nothing
        Next i
        'Close the browser instance created above; otherwise it stays open.
        .Quit
    End With
    dataSheet.Range("F4").Resize(UBound(results, 1), UBound(results, 2)) = results
End Sub

答案 1 :(得分:0)


# Build another_list with one entry per item of a_list: entry number `index`
# is the list of the `index`-th elements of every item in a_list.
another_list = list()
for index, a in enumerate(a_list):      
  # x[index] raises IndexError as soon as some item x of a_list has fewer than
  # index + 1 elements, i.e. whenever a_list has more items than its shortest
  # item has elements.
  another_list.append([x[index] for x in a_list])  # IndexError

我可能是错的,但是列

Option Explicit

Private Sub GetSoccerStats()
    'For every row of the "AVG GOAL DATA" sheet that still needs data (from the
    'row after the last used cell in column C down to the last used row in
    'column B), downloads the stats page whose URL is selected by column J
    '("current" -> column K, "last" -> column L) and writes:
    '   "matches played"    (2nd cell) -> column G
    '   "matches remaining" (2nd cell) -> column H
    '   "home goals"        (3rd cell) -> column C
    '   "away goals"        (3rd cell) -> column E
    'Stops with a message box on an unexpected value in column J or on a
    'non-200 HTTP response.
    '
    'Set a reference (VBE > Tools > References) to the following libraries:
    '   1) Microsoft XML, v6.0
    '   2) Microsoft HTML Object Library

    Dim sourceSheet As Worksheet
    Set sourceSheet = ThisWorkbook.Worksheets("AVG GOAL DATA")

    Dim firstRowToFetchDataFor As Long
    firstRowToFetchDataFor = sourceSheet.Cells(sourceSheet.Rows.Count, "C").End(xlUp).Row + 1 ' Assumes a row needs pulling if the value in column C is blank.

    Dim lastRowToFetchDataFor As Long
    lastRowToFetchDataFor = sourceSheet.Cells(sourceSheet.Rows.Count, "B").End(xlUp).Row

    Dim xmlReq As MSXML2.XMLHTTP60
    Set xmlReq = New MSXML2.XMLHTTP60

    Dim htmlDoc As MSHTML.HTMLDocument
    Set htmlDoc = New MSHTML.HTMLDocument

    Dim rowIndex As Long
    For rowIndex = firstRowToFetchDataFor To lastRowToFetchDataFor

        Dim URL As String
        Select Case LCase$(sourceSheet.Cells(rowIndex, "J"))
            Case "current"
                URL = sourceSheet.Cells(rowIndex, "K")
            Case "last"
                URL = sourceSheet.Cells(rowIndex, "L")
            Case Else
                MsgBox "Expected 'current' or 'last', instead got '" & sourceSheet.Cells(rowIndex, "J") & "' in cell '" & sourceSheet.Cells(rowIndex, "J").Address(False, False) & "'.", vbCritical
                Application.Goto sourceSheet.Cells(rowIndex, "J")
                Exit Sub
        End Select

        With xmlReq
            .Open "GET", URL, False
            .send
            If .Status <> 200 Then
                MsgBox "Request returned HTTP " & .Status & ":" & vbNewLine & vbNewLine & .statusText, vbCritical
                Exit Sub
            End If
            htmlDoc.body.innerHTML = .responseText
        End With

        Dim htmlTableExtracted As MSHTML.HTMLTable
        On Error Resume Next
        Set htmlTableExtracted = htmlDoc.getElementsByClassName("table-main leaguestats")(0)
        On Error GoTo 0

        If Not (htmlTableExtracted Is Nothing) Then
            Dim tableRow As MSHTML.HTMLTableRow
            For Each tableRow In htmlTableExtracted.Rows
                Select Case LCase$(tableRow.Cells(0).innerText)
                    Case "matches played"
                        sourceSheet.Cells(rowIndex, "G") = tableRow.Cells(1).innerText
                    Case "matches remaining"
                        sourceSheet.Cells(rowIndex, "H") = tableRow.Cells(1).innerText
                    Case "home goals"
                        sourceSheet.Cells(rowIndex, "C") = tableRow.Cells(2).innerText
                    Case "away goals"
                        sourceSheet.Cells(rowIndex, "E") = tableRow.Cells(2).innerText
                End Select
            Next tableRow
            Set htmlTableExtracted = Nothing ' Prevent this iteration's result having effects on succeeding iterations
        End If

    Next rowIndex

End Sub

中是否不应该包含“遥远的目标”?我假设“ A SCR AVG”中的“ A”代表“ Away”(因为“ H SCR AVG”中的“ H”似乎代表“ Home”)。因此,即使屏幕截图显示应该将它们写到E列中,我还是在列E上写了“离开目标”(或者可能我没有正确阅读)。