Excel VBA宏:从跨越多个页面的站点表中截取​​数据

时间:2014-10-30 22:46:43

标签: excel vba excel-vba web-scraping

提前感谢您的帮助。我正在运行Windows 8.1,我有最新的IE / Chrome浏览器和最新的Excel。我试图编写一个从StackOverflow(https://stackoverflow.com/tags)中提取数据的Excel宏。具体来说,我试图提取日期(运行宏),标签名称,标签数量以及标签的简要说明。我让它适用于表格的第一页,但不适用于其他页面(目前有1132页)。现在,它每次运行宏时都会覆盖数据,而且我不确定如何在运行之前让它查找下一个空单元格。最后,我试图让它自动运行一次周。

我非常感谢这里的任何帮助。问题是:

  1. 从第一页以外的网络表中提取数据
  2. 将数据刮到下一个空行而不是覆盖
  3. 使宏每周自动运行一次
  4. 代码(目前为止)如下。谢谢!

    Enum READYSTATE
    READYSTATE_UNINITIALIZED = 0
    READYSTATE_LOADING = 1
    READYSTATE_LOADED = 2
    READYSTATE_INTERACTIVE = 3
    READYSTATE_COMPLETE = 4
    End Enum
    
    Sub ImportStackOverflowData()
      'to refer to the running copy of Internet Explorer
      Dim ie As InternetExplorer
      'to refer to the HTML document returned
      Dim html As HTMLDocument
      'open Internet Explorer in memory, and go to website
      Set ie = New InternetExplorer
      ie.Visible = False
      ie.navigate "http://stackoverflow.com/tags"
    
      'Wait until IE is done loading page
      Do While ie.READYSTATE <> READYSTATE_COMPLETE
        Application.StatusBar = "Trying to go to StackOverflow ..."
        DoEvents
      Loop
    
      'show text of HTML document returned
      Set html = ie.document
    
      'close down IE and reset status bar
      Set ie = Nothing
      Application.StatusBar = ""
    
      'clear old data out and put titles in
      'Cells.Clear
      'put heading across the top of row 3
      Range("A3").Value = "Date Pulled"
      Range("B3").Value = "Keyword"
      Range("C3").Value = "# Of Tags"
      'Range("C3").Value = "Asked This Week"
      Range("D3").Value = "Description"
    
      Dim TagList As IHTMLElement
      Dim Tags As IHTMLElementCollection
      Dim Tag As IHTMLElement
      Dim RowNumber As Long
      Dim TagFields As IHTMLElementCollection
      Dim TagField As IHTMLElement
      Dim Keyword As String
      Dim NumberOfTags As String
      'Dim AskedThisWeek As String
      Dim TagDescription As String
      'Dim QuestionFieldLinks As IHTMLElementCollection
      Dim TodaysDate As Date
    
      Set TagList = html.getElementById("tags-browser")
      Set Tags = html.getElementsByClassName("tag-cell")
      RowNumber = 4
    
      For Each Tag In Tags
        'if this is the tag containing the details, process it
        If Tag.className = "tag-cell" Then
          'get a list of all of the parts of this question,
          'and loop over them
          Set TagFields = Tag.all
    
          For Each TagField In TagFields
            'if this is the keyword, store it
            If TagField.className = "post-tag" Then
              'store the text value
              Keyword = TagField.innerText
              Cells(RowNumber, 2).Value = TagField.innerText
            End If
    
            If TagField.className = "item-multiplier-count" Then
              'store the integer for number of tags
              NumberOfTags = TagField.innerText
              'NumberOfTags = Replace(NumberOfTags, "x", "")
              Cells(RowNumber, 3).Value = Trim(NumberOfTags)
            End If
    
            If TagField.className = "excerpt" Then
              Description = TagField.innerText
              Cells(RowNumber, 4).Value = TagField.innerText
            End If
    
            TodaysDate = Format(Now, "MM/dd/yy")
            Cells(RowNumber, 1).Value = TodaysDate
    
          Next TagField
    
          'go on to next row of worksheet
          RowNumber = RowNumber + 1
        End If
      Next
    
      Set html = Nothing
    
      'do some final formatting
      Range("A3").CurrentRegion.WrapText = False
      Range("A3").CurrentRegion.EntireColumn.AutoFit
      Range("A1:C1").EntireColumn.HorizontalAlignment = xlCenter
      Range("A1:D1").Merge
      Range("A1").Value = "StackOverflow Tag Trends"
      Range("A1").Font.Bold = True
      Application.StatusBar = ""
      MsgBox "Done!"
    End Sub
    

3 个答案:

答案 0 :(得分:1)

当他们通过数据资源管理器等方式为您提供基础数据时,无需刮掉Stack Overflow。在数据资源管理器中使用此查询可以获得所需的结果:

select t.TagName, t.Count, p.Body
 from Tags t inner join Posts p
 on t.ExcerptPostId = p.Id
 order by t.count desc;

该查询的固定链接是here和&#34;下载CSV&#34;查询运行后出现的选项可能是将数据导入Excel的最简单方法。如果您想自动执行这部分内容,则直接链接到结果的CSV下载here

答案 1 :(得分:1)

您可以改进它以解析出确切的元素,但是它会循环所有页面并获取所有标签信息(标签旁边的所有内容)

Option Explicit

Public Sub ImportStackOverflowData()

    Dim ie As New InternetExplorer, html As HTMLDocument

    Application.ScreenUpdating = False
    With ie
        .Visible = True

        .navigate "https://stackoverflow.com/tags"

        While .Busy Or .READYSTATE < 4: DoEvents: Wend

        Set html = .document
        Dim numPages As Long, i As Long, info As Object, item As Object, counter As Long
        numPages = html.querySelector(".page-numbers.dots ~ a").innerText

        For i = 1 To 2  ' numPages ''<==1 to 2 for testing; use to numPages
            DoEvents
            Set info = html.getElementById("tags_list")
            For Each item In info.getElementsByClassName("grid-layout--cell tag-cell")
                counter = counter + 1
                Cells(counter, 1) = item.innerText
            Next item
            html.querySelector(".page-numbers.next").Click
            While .Busy Or .READYSTATE < 4: DoEvents: Wend
            Set html = .document
        Next i
        Application.ScreenUpdating = True
        .Quit '<== Remember to quit application
    End With
End Sub

答案 2 :(得分:0)

我没有使用DOM,但我发现只需在已知标签之间进行搜索就很容易。如果您正在寻找的表达式太常见,只需稍微调整一下代码,以便在字符串后查找字符串)。

一个例子:

Public Sub ZipLookUp()
Dim URL As String, xmlHTTP As Object, html As Object, htmlResponse As String
Dim SStr As String, EStr As String, EndS As Integer, StartS As Integer
Dim Zip4Digit As String

    URL = "https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=1&companyName=&address1=1642+Harmon+Street&address2=&city=Berkeley&state=CA&urbanCode=&postalCode=&zip=94703"
    Set xmlHTTP = CreateObject("MSXML2.XMLHTTP")
    xmlHTTP.Open "GET", URL, False
    On Error GoTo NoConnect
    xmlHTTP.send
    On Error GoTo 0
    Set html = CreateObject("htmlfile")
    htmlResponse = xmlHTTP.ResponseText
    If htmlResponse = Null Then
        MsgBox ("Aborted Run - HTML response was null")
        Application.ScreenUpdating = True
        GoTo End_Prog
    End If

    'Searching for a string within 2 strings
    SStr = "<span class=""address1 range"">" ' first string
    EStr = "</span><br />"                   ' second string
    StartS = InStr(1, htmlResponse, SStr, vbTextCompare) + Len(SStr)
    EndS = InStr(StartS, htmlResponse, EStr, vbTextCompare)
    Zip4Digit = Left(Mid(htmlResponse, StartS, EndS - StartS), 4)

    MsgBox Zip4Digit

GoTo End_Prog
NoConnect:
    If Err = -2147467259 Or Err = -2146697211 Then MsgBox "Error - No Connection": GoTo End_Prog 'MsgBox Err & ": " & Error(Err)
End_Prog:
End Sub