Question

提前感谢您的帮助。我正在运行Windows 8.1，我有最新的IE / Chrome浏览器和最新的Excel。我试图编写一个从StackOverflow（https://stackoverflow.com/tags）中提取数据的Excel宏。具体来说，我试图提取日期（运行宏），标签名称，标签数量以及标签的简要说明。我让它适用于表格的第一页，但不适用于其他页面（目前有1132页）。现在，它每次运行宏时都会覆盖数据，而且我不确定如何在运行之前让它查找下一个空单元格。最后，我试图让它自动运行一次周。

我非常感谢这里的任何帮助。问题是：

从第一页以外的网络表中提取数据
将数据刮到下一个空行而不是覆盖
使宏每周自动运行一次

代码（目前为止）如下。谢谢！

Enum READYSTATE
READYSTATE_UNINITIALIZED = 0
READYSTATE_LOADING = 1
READYSTATE_LOADED = 2
READYSTATE_INTERACTIVE = 3
READYSTATE_COMPLETE = 4
End Enum

Sub ImportStackOverflowData()
  'to refer to the running copy of Internet Explorer
  Dim ie As InternetExplorer
  'to refer to the HTML document returned
  Dim html As HTMLDocument
  'open Internet Explorer in memory, and go to website
  Set ie = New InternetExplorer
  ie.Visible = False
  ie.navigate "http://stackoverflow.com/tags"

  'Wait until IE is done loading page
  Do While ie.READYSTATE <> READYSTATE_COMPLETE
    Application.StatusBar = "Trying to go to StackOverflow ..."
    DoEvents
  Loop

  'show text of HTML document returned
  Set html = ie.document

  'close down IE and reset status bar
  Set ie = Nothing
  Application.StatusBar = ""

  'clear old data out and put titles in
  'Cells.Clear
  'put heading across the top of row 3
  Range("A3").Value = "Date Pulled"
  Range("B3").Value = "Keyword"
  Range("C3").Value = "# Of Tags"
  'Range("C3").Value = "Asked This Week"
  Range("D3").Value = "Description"

  Dim TagList As IHTMLElement
  Dim Tags As IHTMLElementCollection
  Dim Tag As IHTMLElement
  Dim RowNumber As Long
  Dim TagFields As IHTMLElementCollection
  Dim TagField As IHTMLElement
  Dim Keyword As String
  Dim NumberOfTags As String
  'Dim AskedThisWeek As String
  Dim TagDescription As String
  'Dim QuestionFieldLinks As IHTMLElementCollection
  Dim TodaysDate As Date

  Set TagList = html.getElementById("tags-browser")
  Set Tags = html.getElementsByClassName("tag-cell")
  RowNumber = 4

  For Each Tag In Tags
    'if this is the tag containing the details, process it
    If Tag.className = "tag-cell" Then
      'get a list of all of the parts of this question,
      'and loop over them
      Set TagFields = Tag.all

      For Each TagField In TagFields
        'if this is the keyword, store it
        If TagField.className = "post-tag" Then
          'store the text value
          Keyword = TagField.innerText
          Cells(RowNumber, 2).Value = TagField.innerText
        End If

        If TagField.className = "item-multiplier-count" Then
          'store the integer for number of tags
          NumberOfTags = TagField.innerText
          'NumberOfTags = Replace(NumberOfTags, "x", "")
          Cells(RowNumber, 3).Value = Trim(NumberOfTags)
        End If

        If TagField.className = "excerpt" Then
          Description = TagField.innerText
          Cells(RowNumber, 4).Value = TagField.innerText
        End If

        TodaysDate = Format(Now, "MM/dd/yy")
        Cells(RowNumber, 1).Value = TodaysDate

      Next TagField

      'go on to next row of worksheet
      RowNumber = RowNumber + 1
    End If
  Next

  Set html = Nothing

  'do some final formatting
  Range("A3").CurrentRegion.WrapText = False
  Range("A3").CurrentRegion.EntireColumn.AutoFit
  Range("A1:C1").EntireColumn.HorizontalAlignment = xlCenter
  Range("A1:D1").Merge
  Range("A1").Value = "StackOverflow Tag Trends"
  Range("A1").Font.Bold = True
  Application.StatusBar = ""
  MsgBox "Done!"
End Sub

Answer 1

当他们通过数据资源管理器等方式为您提供基础数据时，无需刮掉Stack Overflow。在数据资源管理器中使用此查询可以获得所需的结果：

select t.TagName, t.Count, p.Body
 from Tags t inner join Posts p
 on t.ExcerptPostId = p.Id
 order by t.count desc;

该查询的固定链接是here和＆＃34;下载CSV＆＃34;查询运行后出现的选项可能是将数据导入Excel的最简单方法。如果您想自动执行这部分内容，则直接链接到结果的CSV下载here

Answer 2

您可以改进它以解析出确切的元素，但是它会循环所有页面并获取所有标签信息（标签旁边的所有内容）

Option Explicit

Public Sub ImportStackOverflowData()

    Dim ie As New InternetExplorer, html As HTMLDocument

    Application.ScreenUpdating = False
    With ie
        .Visible = True

        .navigate "https://stackoverflow.com/tags"

        While .Busy Or .READYSTATE < 4: DoEvents: Wend

        Set html = .document
        Dim numPages As Long, i As Long, info As Object, item As Object, counter As Long
        numPages = html.querySelector(".page-numbers.dots ~ a").innerText

        For i = 1 To 2  ' numPages ''<==1 to 2 for testing; use to numPages
            DoEvents
            Set info = html.getElementById("tags_list")
            For Each item In info.getElementsByClassName("grid-layout--cell tag-cell")
                counter = counter + 1
                Cells(counter, 1) = item.innerText
            Next item
            html.querySelector(".page-numbers.next").Click
            While .Busy Or .READYSTATE < 4: DoEvents: Wend
            Set html = .document
        Next i
        Application.ScreenUpdating = True
        .Quit '<== Remember to quit application
    End With
End Sub

Answer 3

我没有使用DOM，但我发现只需在已知标签之间进行搜索就很容易。如果您正在寻找的表达式太常见，只需稍微调整一下代码，以便在字符串后查找字符串）。

一个例子：

Public Sub ZipLookUp()
Dim URL As String, xmlHTTP As Object, html As Object, htmlResponse As String
Dim SStr As String, EStr As String, EndS As Integer, StartS As Integer
Dim Zip4Digit As String

    URL = "https://tools.usps.com/go/ZipLookupResultsAction!input.action?resultMode=1&companyName=&address1=1642+Harmon+Street&address2=&city=Berkeley&state=CA&urbanCode=&postalCode=&zip=94703"
    Set xmlHTTP = CreateObject("MSXML2.XMLHTTP")
    xmlHTTP.Open "GET", URL, False
    On Error GoTo NoConnect
    xmlHTTP.send
    On Error GoTo 0
    Set html = CreateObject("htmlfile")
    htmlResponse = xmlHTTP.ResponseText
    If htmlResponse = Null Then
        MsgBox ("Aborted Run - HTML response was null")
        Application.ScreenUpdating = True
        GoTo End_Prog
    End If

    'Searching for a string within 2 strings
    SStr = "<span class=""address1 range"">" ' first string
    EStr = "</span><br />"                   ' second string
    StartS = InStr(1, htmlResponse, SStr, vbTextCompare) + Len(SStr)
    EndS = InStr(StartS, htmlResponse, EStr, vbTextCompare)
    Zip4Digit = Left(Mid(htmlResponse, StartS, EndS - StartS), 4)

    MsgBox Zip4Digit

GoTo End_Prog
NoConnect:
    If Err = -2147467259 Or Err = -2146697211 Then MsgBox "Error - No Connection": GoTo End_Prog 'MsgBox Err & ": " & Error(Err)
End_Prog:
End Sub

Excel VBA宏：从跨越多个页面的站点表中截取数据

3 个答案:

Excel VBA宏：从跨越多个页面的站点表中截取​​数据

3 个答案:

Excel VBA宏：从跨越多个页面的站点表中截取数据