阿根廷超市网刮

时间:2017-02-12 13:05:37

标签: excel vba web web-scraping screen-scraping

我正试图从网站上抓取数据:

https://www.disco.com.ar/Comprar/Home.aspx#_atCategory=false&_atGrilla=true&_id=21063

通过 Excel 2013 中的宏,如实时价格,产品名称和图片。

我已经尝试过excel网页查询,但它不起作用。

有没有办法做到这一点?

2 个答案:

答案 0 :(得分:1)

有一个示例显示如何使用XHR和JSON解析从网站检索数据,它包含几个步骤。

  1. 检索数据。
  2. 我使用Chrome开发者工具网络标签了解了XHR。 我找到的大多数相关数据是POST XHR从https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerLimiteDeProductos

    返回的JSON字符串

    POST XHR https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerLimiteDeProductos

    如果没有cookie标题,POST XHR对我不起作用。因此,我必须首先添加额外的HEAD XHR来检索ASP.NET_SessionId cookie,用于控制cookie的服务器版本XMLHTTP。返回cookie的唯一响应标头是来自https://www.disco.com.ar/Login/PreHome.aspx

    的GET XHR

    GET XHR https://www.disco.com.ar/Login/PreHome.aspx

    1. 检索到的JSON字符串应该被解析两次,因为它包含第一个JSON的d属性中包含的第二个有效负载JSON。
    2. 将解析后的JSON对象转换为以二维数组形式呈现的表格形式。
    3. 将数组输出到工作表。您可以通过直接访问阵列来执行进一步处理。
    4. 对于以下网页:

      webpage

      我的输出如下:

      output

      将以下代码放入VBA Project标准模块:

      Option Explicit
      
      Sub GetData()
      
          Dim sCookie As String
          Dim sPayLoad As String
          Dim sCont As String
          Dim vJSON As Variant
          Dim sState As String
          Dim y As Long
          Dim sSection As Variant
          Dim aData()
          Dim aHeader()
      
          ' Get cookie from the site
          With CreateObject("MSXML2.ServerXMLHTTP")
              .Open "HEAD", "https://www.disco.com.ar/Login/PreHome.aspx", False
              .Send
              sCookie = .getAllResponseHeaders
          End With
          sCookie = Split(sCookie, "Set-Cookie: ", 2)(1)
          sCookie = Split(sCookie, ";", 2)(0)
          ' Retrieve JSON data
          sPayLoad = "{IdMenu:""21063"",textoBusqueda:"""", producto:"""", marca:"""", " & _
              "pager:"""", ordenamiento:0, precioDesde:"""", precioHasta:""""}"
          With CreateObject("MSXML2.ServerXMLHTTP")
              .Open "POST", "https://www.disco.com.ar/Comprar/HomeService.aspx/ObtenerArticulosPorDescripcionMarcaFamiliaLevex", False
              .SetRequestHeader "Accept", "application/json, text/javascript, */*; q=0.01"
              .SetRequestHeader "Content-Type", "application/json; charset=utf-8"
              .SetRequestHeader "Content-Length", Len(sPayLoad)
              .SetRequestHeader "Cookie", sCookie
              .Send CStr(sPayLoad)
              sCont = .responseText
          End With
          ' Parse JSON response
          JSON.Parse sCont, vJSON, sState
          sCont = vJSON.Item("d")
          JSON.Parse sCont, vJSON, sState
          ' Output tables
          Sheets(1).Cells.Delete
          y = 1
          For Each sSection In Array("Tipo", "Marca", "Precio", "ResultadosBusquedaLevex", "ArticulosSugereridos")
              JSON.ToArray vJSON.Item(sSection), aData, aHeader
              With Sheets(1)
                  .Cells(y, 1).Value = sSection
                  OutputArray .Cells(y + 1, 1), aHeader
                  Output2DArray .Cells(y + 2, 1), aData
                  .Cells.Columns.AutoFit
              End With
              y = y + UBound(aData, 1) + 4
          Next
      
      End Sub
      
      Sub OutputArray(oDstRng As Range, aCells As Variant)
      
          With oDstRng
              .Parent.Select
              With .Resize( _
                      1, _
                      UBound(aCells) - LBound(aCells) + 1)
                  .NumberFormat = "@"
                  .Value = aCells
              End With
          End With
      
      End Sub
      
      Sub Output2DArray(oDstRng As Range, aCells As Variant)
      
          With oDstRng
              .Parent.Select
              With .Resize( _
                      UBound(aCells, 1) - LBound(aCells, 1) + 1, _
                      UBound(aCells, 2) - LBound(aCells, 2) + 1)
                  .NumberFormat = "@"
                  .Value = aCells
              End With
          End With
      
      End Sub
      

      再创建一个标准模块,将其命名为JSON并将下面的代码放入其中,此代码提供JSON处理功能:

      Option Explicit
      
      Private sBuffer As String
      Private oTokens As Object
      Private oRegEx As Object
      Private bMatch As Boolean
      Private oChunks As Object
      Private oHeader As Object
      Private aData() As Variant
      Private i As Long
      
      Sub Parse(ByVal sSample As String, vJSON As Variant, sState As String)
      
          ' Backus–Naur form JSON parser implementation based on RegEx
          ' Input:
          ' sSample - source JSON string
          ' Output:
          ' vJson - created object or array to be returned as result
          ' sState - string Object|Array|Error depending on processing
      
          sBuffer = sSample
          Set oTokens = CreateObject("Scripting.Dictionary")
          Set oRegEx = CreateObject("VBScript.RegExp")
          With oRegEx ' Patterns based on specification http://www.json.org/
              .Global = True
              .MultiLine = True
              .IgnoreCase = True ' Unspecified True, False, Null accepted
              .Pattern = "(?:'[^']*'|""(?:\\""|[^""])*"")(?=\s*[,\:\]\}])" ' Double-quoted string, unspecified quoted string
              Tokenize "s"
              .Pattern = "[+-]?(?:\d+\.\d*|\.\d+|\d+)(?:e[+-]?\d+)?(?=\s*[,\]\}])" ' Number, E notation number
              Tokenize "d"
              .Pattern = "\b(?:true|false|null)(?=\s*[,\]\}])" ' Constants true, false, null
              Tokenize "c"
              .Pattern = "\b[A-Za-z_]\w*(?=\s*\:)" ' Unspecified non-double-quoted property name accepted
              Tokenize "n"
              .Pattern = "\s+"
              sBuffer = .Replace(sBuffer, "") ' Remove unnecessary spaces
              .MultiLine = False
              Do
                  bMatch = False
                  .Pattern = "<\d+(?:[sn])>\:<\d+[codas]>" ' Object property structure
                  Tokenize "p"
                  .Pattern = "\{(?:<\d+p>(?:,<\d+p>)*)?\}" ' Object structure
                  Tokenize "o"
                  .Pattern = "\[(?:<\d+[codas]>(?:,<\d+[codas]>)*)?\]" ' Array structure
                  Tokenize "a"
              Loop While bMatch
              .Pattern = "^<\d+[oa]>$" ' Top level object structure, unspecified array accepted
              If .Test(sBuffer) And oTokens.Exists(sBuffer) Then
                  Retrieve sBuffer, vJSON
                  sState = IIf(IsObject(vJSON), "Object", "Array")
              Else
                  vJSON = Null
                  sState = "Error"
              End If
          End With
          Set oTokens = Nothing
          Set oRegEx = Nothing
      
      End Sub
      
      Private Sub Tokenize(sType)
      
          Dim aContent() As String
          Dim lCopyIndex As Long
          Dim i As Long
          Dim sKey As String
      
          With oRegEx.Execute(sBuffer)
              If .Count = 0 Then Exit Sub
              ReDim aContent(0 To .Count - 1)
              lCopyIndex = 1
              For i = 0 To .Count - 1
                  With .Item(i)
                      sKey = "<" & oTokens.Count & sType & ">"
                      oTokens(sKey) = .Value
                      aContent(i) = Mid(sBuffer, lCopyIndex, .FirstIndex - lCopyIndex + 1) & sKey
                      lCopyIndex = .FirstIndex + .Length + 1
                  End With
              Next
          End With
          sBuffer = Join(aContent, "") & Mid(sBuffer, lCopyIndex, Len(sBuffer) - lCopyIndex + 1)
          bMatch = True
      
      End Sub
      
      Private Sub Retrieve(sTokenKey, vTransfer)
      
          Dim sTokenValue As String
          Dim sName As String
          Dim vValue As Variant
          Dim aTokens() As String
          Dim i As Long
      
          sTokenValue = oTokens(sTokenKey)
          With oRegEx
              .Global = True
              Select Case Left(Right(sTokenKey, 2), 1)
                  Case "o"
                      Set vTransfer = CreateObject("Scripting.Dictionary")
                      aTokens = Split(sTokenValue, "<")
                      For i = 1 To UBound(aTokens)
                          Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vTransfer
                      Next
                  Case "p"
                      aTokens = Split(sTokenValue, "<", 4)
                      Retrieve "<" & Split(aTokens(1), ">", 2)(0) & ">", sName
                      Retrieve "<" & Split(aTokens(2), ">", 2)(0) & ">", vValue
                      If IsObject(vValue) Then
                          Set vTransfer(sName) = vValue
                      Else
                          vTransfer(sName) = vValue
                      End If
                  Case "a"
                      aTokens = Split(sTokenValue, "<")
                      If UBound(aTokens) = 0 Then
                          vTransfer = Array()
                      Else
                          ReDim vTransfer(0 To UBound(aTokens) - 1)
                          For i = 1 To UBound(aTokens)
                              Retrieve "<" & Split(aTokens(i), ">", 2)(0) & ">", vValue
                              If IsObject(vValue) Then
                                  Set vTransfer(i - 1) = vValue
                              Else
                                  vTransfer(i - 1) = vValue
                              End If
                          Next
                      End If
                  Case "n"
                      vTransfer = sTokenValue
                  Case "s"
                      vTransfer = Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace( _
                          Mid(sTokenValue, 2, Len(sTokenValue) - 2), _
                          "\""", """"), _
                          "\\", "\"), _
                          "\/", "/"), _
                          "\b", Chr(8)), _
                          "\f", Chr(12)), _
                          "\n", vbLf), _
                          "\r", vbCr), _
                          "\t", vbTab)
                      .Global = False
                      .Pattern = "\\u[0-9a-fA-F]{4}"
                      Do While .Test(vTransfer)
                          vTransfer = .Replace(vTransfer, ChrW(("&H" & Right(.Execute(vTransfer)(0).Value, 4)) * 1))
                      Loop
                  Case "d"
                      vTransfer = Evaluate(sTokenValue)
                  Case "c"
                      Select Case LCase(sTokenValue)
                          Case "true"
                              vTransfer = True
                          Case "false"
                              vTransfer = False
                          Case "null"
                              vTransfer = Null
                      End Select
              End Select
          End With
      
      End Sub
      
      Function Serialize(vJSON As Variant) As String
      
          Set oChunks = CreateObject("Scripting.Dictionary")
          SerializeElement vJSON, ""
          Serialize = Join(oChunks.Items(), "")
          Set oChunks = Nothing
      
      End Function
      
      Private Sub SerializeElement(vElement As Variant, ByVal sIndent As String)
      
          Dim aKeys() As Variant
          Dim i As Long
      
          With oChunks
              Select Case VarType(vElement)
                  Case vbObject
                      If vElement.Count = 0 Then
                          .Item(.Count) = "{}"
                      Else
                          .Item(.Count) = "{" & vbCrLf
                          aKeys = vElement.Keys
                          For i = 0 To UBound(aKeys)
                              .Item(.Count) = sIndent & vbTab & """" & aKeys(i) & """" & ": "
                              SerializeElement vElement(aKeys(i)), sIndent & vbTab
                              If Not (i = UBound(aKeys)) Then .Item(.Count) = ","
                              .Item(.Count) = vbCrLf
                          Next
                          .Item(.Count) = sIndent & "}"
                      End If
                  Case Is >= vbArray
                      If UBound(vElement) = -1 Then
                          .Item(.Count) = "[]"
                      Else
                          .Item(.Count) = "[" & vbCrLf
                          For i = 0 To UBound(vElement)
                              .Item(.Count) = sIndent & vbTab
                              SerializeElement vElement(i), sIndent & vbTab
                              If Not (i = UBound(vElement)) Then .Item(.Count) = "," 'sResult = sResult & ","
                              .Item(.Count) = vbCrLf
                          Next
                          .Item(.Count) = sIndent & "]"
                      End If
                  Case vbInteger, vbLong
                      .Item(.Count) = vElement
                  Case vbSingle, vbDouble
                      .Item(.Count) = Replace(vElement, ",", ".")
                  Case vbNull
                      .Item(.Count) = "null"
                  Case vbBoolean
                      .Item(.Count) = IIf(vElement, "true", "false")
                  Case Else
                      .Item(.Count) = """" & _
                          Replace(Replace(Replace(Replace(Replace(Replace(Replace(Replace(vElement, _
                              "\", "\\"), _
                              """", "\"""), _
                              "/", "\/"), _
                              Chr(8), "\b"), _
                              Chr(12), "\f"), _
                              vbLf, "\n"), _
                              vbCr, "\r"), _
                              vbTab, "\t") & _
                          """"
              End Select
          End With
      
      End Sub
      
      Function ToString(vJSON As Variant) As String
      
          Select Case VarType(vJSON)
              Case vbObject, Is >= vbArray
                  Set oChunks = CreateObject("Scripting.Dictionary")
                  ToStringElement vJSON, ""
                  oChunks.Remove 0
                  ToString = Join(oChunks.Items(), "")
                  Set oChunks = Nothing
              Case vbNull
                  ToString = "Null"
              Case vbBoolean
                  ToString = IIf(vJSON, "True", "False")
              Case Else
                  ToString = CStr(vJSON)
          End Select
      
      End Function
      
      Private Sub ToStringElement(vElement As Variant, ByVal sIndent As String)
      
          Dim aKeys() As Variant
          Dim i As Long
      
          With oChunks
              Select Case VarType(vElement)
                  Case vbObject
                      If vElement.Count = 0 Then
                          .Item(.Count) = "''"
                      Else
                          .Item(.Count) = vbCrLf
                          aKeys = vElement.Keys
                          For i = 0 To UBound(aKeys)
                              .Item(.Count) = sIndent & aKeys(i) & ": "
                              ToStringElement vElement(aKeys(i)), sIndent & vbTab
                              If Not (i = UBound(aKeys)) Then .Item(.Count) = vbCrLf
                          Next
                      End If
                  Case Is >= vbArray
                      If UBound(vElement) = -1 Then
                          .Item(.Count) = "''"
                      Else
                          .Item(.Count) = vbCrLf
                          For i = 0 To UBound(vElement)
                              .Item(.Count) = sIndent & i & ": "
                              ToStringElement vElement(i), sIndent & vbTab
                              If Not (i = UBound(vElement)) Then .Item(.Count) = vbCrLf
                          Next
                      End If
                  Case vbNull
                      .Item(.Count) = "Null"
                  Case vbBoolean
                      .Item(.Count) = IIf(vElement, "True", "False")
                  Case Else
                      .Item(.Count) = CStr(vElement)
              End Select
          End With
      
      End Sub
      
      Sub ToArray(vJSON As Variant, aRows() As Variant, aHeader() As Variant)
      
          ' Input:
          ' vJSON - Array or Object which contains rows data
          ' Output:
          ' aData - 2d array representing JSON data
          ' aHeader - 1d array of property names
      
          Dim sName As Variant
      
          Set oHeader = CreateObject("Scripting.Dictionary")
          Select Case VarType(vJSON)
              Case vbObject
                  If vJSON.Count > 0 Then
                      ReDim aData(0 To vJSON.Count - 1, 0 To 0)
                      oHeader("#") = 0
                      i = 0
                      For Each sName In vJSON
                          aData(i, 0) = "#" & sName
                          ToArrayElement vJSON(sName), ""
                          i = i + 1
                      Next
                  Else
                      ReDim aData(0 To 0, 0 To 0)
                  End If
              Case Is >= vbArray
                  If UBound(vJSON) >= 0 Then
                      ReDim aData(0 To UBound(vJSON), 0 To 0)
                      For i = 0 To UBound(vJSON)
                          ToArrayElement vJSON(i), ""
                      Next
                  Else
                      ReDim aData(0 To 0, 0 To 0)
                  End If
              Case Else
                  ReDim aData(0 To 0, 0 To 0)
                  aData(0, 0) = ToString(vJSON)
          End Select
          aHeader = oHeader.Keys()
          Set oHeader = Nothing
          aRows = aData
          Erase aData
      
      End Sub
      
      Private Sub ToArrayElement(vElement As Variant, sFieldName As String)
      
          Dim sName As Variant
          Dim j As Long
      
          Select Case VarType(vElement)
              Case vbObject ' collection of objects
                  For Each sName In vElement
                      ToArrayElement vElement(sName), sFieldName & IIf(sFieldName = "", "", "_") & sName
                  Next
              Case Is >= vbArray  ' collection of arrays
                  For j = 0 To UBound(vElement)
                      ToArrayElement vElement(j), sFieldName & IIf(sFieldName = "", "", "_") & "#" & j
                  Next
              Case Else
                  If Not oHeader.Exists(sFieldName) Then
                      oHeader(sFieldName) = oHeader.Count
                      If UBound(aData, 2) < oHeader.Count - 1 Then ReDim Preserve aData(0 To UBound(aData, 1), 0 To oHeader.Count - 1)
                  End If
                  j = oHeader(sFieldName)
                  aData(i, j) = ToString(vElement)
          End Select
      
      End Sub
      

      检查VBA-JSON-parser on GitHub以获取最新版本的JSON解析器(将JSON.bas模块导入VBA项目以进行JSON处理)。

答案 1 :(得分:1)

我建议你在 Python 中使用 Selenium。 配置需要一段时间,但一旦完成,您将拥有完成这项工作的完美工具。 它可以让您使用简单的 Python 语法、使用您想要的网络浏览器(Firefox 或 Chrome)、阅读 javascript 并与 javascript 交互来抓取您需要的任何网站。 我每天都使用它。