将HTML文档的javascript部分中的字段提取到表中?地理坐标

时间:2015-02-04 19:31:05

标签: javascript json excel-vba web-scraping vba

我有一个HTML文档,其中包含Javascript块中的地理信息。它是此网页的源代码:https://energy.ehawaii.gov/epd/public/energy-projects-map.html

这可以被视为地图,也可以视为列表。

我想要实现的是在Excel中使用该列表,但是使用" Latitude"和#34;经度"的字段。 Google地图标记指定了Javascript中的LatLng

如何使用VB之类的东西来处理HTML文件的源代码,并组织成一个包含以下字段/列的表:

  • 说明(来自<a ... title="such and such">
  • 技术(例如来自<p>Technology: Solar</p>
  • 纬度(来自google.maps.LatLng(latitude, longitude);
  • Longtitude(来自与纬度相同的代码行,但使用第二个变量)?

所有帮助表示赞赏!

2 个答案:

答案 0 :(得分:1)

尝试基于XMLHTTP请求的此VBScript解决方案。只需复制下面的代码,粘贴到文本文件,将其保存为.vbs并运行它。脚本尚未优化,所有请求都不是异步的,因此我的PC上大约需要40秒才能获取所有数据。

Option Explicit
Dim arrCells(), arrList, arrTmp, sRespHeaders, sRespText, arrSetHeaders, i, j, iTotal, oApp, oWB, oWS, oOutput

' Create output window
Output oOutput

' Get cookies
oOutput.write "Get cookies"
XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-projects-map.html", Array(), sRespHeaders, sRespText
ParseResponse "^Set-(Cookie): ([\S]*?=[\S]*?);[\s\S]*?$", sRespHeaders, arrSetHeaders

' Get project list
oOutput.write "Get project list"
arrList = Array()
XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-projects-list.json?sEcho=2&iColumns=5&sColumns=&iDisplayStart=1&iDisplayLength=0&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false&bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=&bRegex_2=false&bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=&bRegex_4=false&bSearchable_4=true&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1&bSortable_0=true&bSortable_1=true&bSortable_2=true&bSortable_3=true&bSortable_4=true", arrSetHeaders, "", sRespText
ParseProjects sRespText, arrList, iTotal
oOutput.write "Get project list: " & (UBound(arrList) + 1) & " of " & iTotal

' Rearrange to 2-dimensional array, get LatLng
ReDim arrCells(UBound(arrList), 8) ' Name, Technology, Island, Capacity, Location, RID, Type, Lat, Lng
For i = 0 To UBound(arrList)
    For j = 0 To 6
        arrCells(i, j) = arrList(i)(j)
    Next
    oOutput.write "Get LatLng: " & (i + 1) & " of " & iTotal
    arrTmp = RequestLatLng(arrList(i)(5))
    arrCells(i, 7) = arrTmp(0)
    arrCells(i, 8) = arrTmp(1)
Next

' Create Excel worksheet, output data
oOutput.write "Export to Excel"
Set oApp = CreateObject("Excel.Application")
oApp.Visible = True
Set oWB = oApp.Workbooks.Add(-4167) ' xlWBATWorksheet
Set oWS = oWB.Worksheets(1)
oWS.Range(oWS.Cells(1, 1), oWS.Cells(UBound(arrCells) + 1, 9)).Value = arrCells
oWS.Columns.AutoFit
oWB.Saved = True
oOutput.write "Completed"

Sub XmlHttpGet(sQuery, arrSetHeaders, sRespHeaders, sRespText)
    Dim arrHeader
    With CreateObject("MSXML2.ServerXMLHTTP")
        .SetOption 2, 13056 ' SXH_SERVER_CERT_IGNORE_ALL_SERVER_ERRORS
        .Open "GET", sQuery, False
        For Each arrHeader In arrSetHeaders
            .SetRequestHeader arrHeader(0), arrHeader(1)
        Next
        .Send ""
        sRespHeaders = .GetAllResponseHeaders
        sRespText = .ResponseText
    End With
End Sub

Sub ParseResponse(sPattern, sResponse, aData)
    Dim oMatch, aTmp, sSubMatch
    aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = True
        .MultiLine = True
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                aTmp = Array()
                For Each sSubMatch In oMatch.SubMatches
                    PushItem aTmp, sSubMatch
                Next
                PushItem aData, aTmp
            End If
        Next
    End With
End Sub

Sub PushItem(aList, vItem)
    ReDim Preserve aList(UBound(aList) + 1)
    aList(UBound(aList)) = vItem
End Sub

Sub ParseProjects(sJson, arrProj, iTotalRecords)
    Dim i, q
    With CreateObject("htmlfile")
        With .parentwindow
            .execscript ";", "jscript"
            .eval ("json = " & sJson & ";")
            iTotalRecords = CInt(.json.iTotalRecords)
            Do While .json.aaData.Length
                ReDim Preserve arrProj(UBound(arrProj) + 1)
                With .json.aaData.Shift()
                    arrProj(UBound(arrProj)) = Array(.Shift(), .Shift(), .Shift(), .Shift(), .Shift(), .Shift(), .Shift())
                End With
            Loop
        End With
    End With
End Sub

Function RequestLatLng(sRid)
    Dim sRespText, arrTmp, sTmp
    XmlHttpGet "https://energy.ehawaii.gov/epd/public/energy-project-details.html?rid=" & sRid, Array(), "", sRespText
    arrTmp = Split(sRespText, "google.maps.LatLng(")
    If UBound(arrTmp) >= 1 Then
        sTmp = arrTmp(1)
        arrTmp = Split(sTmp, "),")
        If UBound(arrTmp) >= 1 Then
            RequestLatLng = Split(arrTmp(0), ", ")
            Exit Function
        End If
    End If
    RequestLatLng = Array("#", "#")
End Function

Sub Output(oWnd)
    Set oWnd = ShowWindow("energy.ehawaii.gov", "", 354, 118)
End Sub

Function ShowWindow(sTitle, sBG, iWidth, iHeight)
    Set ShowWindow = CreateWindow()
    With ShowWindow
        With .document
            .title = sTitle
            .getElementsByTagName("head")(0).appendChild .createElement("style")
            .styleSheets(0).cssText = "* {font: 8pt tahoma; margin: 5px;}"
            .body.style.background = "buttonface"
            .body.style.backgroundRepeat = "no-repeat"
            .body.style.backgroundImage = "url(" & sBG & ")"
            .body.innerHTML = ""
        End With
        .resizeTo .screen.availWidth, .screen.availHeight
        .resizeTo iWidth + .screen.availWidth - .document.body.offsetWidth, iHeight + .screen.availHeight - .document.body.offsetHeight
        .moveTo CInt((.screen.availWidth - iWidth) / 2), CInt((.screen.availHeight - iHeight) / 2)
        .execScript "var handlers, thunks = {body_onunload: function() {handlers.WSHQuit()}};"
        Execute "Class clsHandlers: Public Sub WSHQuit(): WScript.Quit: End Sub: End Class"
        Set .handlers = New clsHandlers
        Set .document.body.onunload = .thunks.body_onunload
        .execScript "var write = function(t) {document.body.innerHTML = t};"
    End With
End Function

Function CreateWindow()
    ' source http://forum.script-coding.com/viewtopic.php?pid=75356#p75356
    Dim sSignature, oShellWnd, oProc
    On Error Resume Next
    sSignature = Left(CreateObject("Scriptlet.TypeLib").Guid, 38)
    Do
        Set oProc = CreateObject("WScript.Shell").Exec("mshta ""about:<head><script>moveTo(-32000,-32000);document.title=' '</script><hta:application id=app border=thick minimizebutton=no maximizebutton=no scroll=no showintaskbar=yes contextmenu=no selection=no innerborder=no icon=""""/><object id='shell' classid='clsid:8856F961-340A-11D0-A96B-00C04FD705A2'><param name=RegisterAsBrowser value=1></object><script>shell.putproperty('" & sSignature & "',document.parentWindow);</script></head>""")
        Do
            If oProc.Status > 0 Then Exit Do
            For Each oShellWnd In CreateObject("Shell.Application").Windows
                Set CreateWindow = oShellWnd.GetProperty(sSignature)
                If Err.Number = 0 Then Exit Function
                Err.Clear
            Next
        Loop
    Loop
End Function

答案 1 :(得分:0)

当然可以。但是在StackOverflow,我们解决的问题不是为现有解决方案编写代码。

但是我首先要分析这个链接: https://maps.googleapis.com/maps/api/js/ViewportInfoService.GetViewportInfo?1m6&1m2&1d12.821947129167481&2d-175.06889349440217&2m2&1d28.109653321636404&2d-140.54826660818202&2u7&4spl-PL&5e0&6sm%40290000000&7b0&8e0&9b0&callback=xdc._48sez1&token=119313

通过它你将获得带有拉特和多头的JSON。你可以使用很多解决方案。我亲自为Excel做了一个简单的Scrape HTML Add-In。你需要学习的只是正则表达式。