如何获取/解析位于HTML头部上方的数组

时间:2018-07-31 03:20:12

标签: html excel excel-vba web-scraping

我在解析获取到的HTML时遇到了困难。

响应头是: Content-Type: text/html,Content-Length: 28078

http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=1054664

我尝试了以下方法:

获取HTML表格,并按行、节点或元素进行解析。这三种方式都勉强可行。但这段HTML很别扭:有随机的空白,没有ID,只有类名,解析起来非常痛苦。

我看到在HTML上方有这张表格的数据,但我不确定它是哪种数据格式。我该如何获取并解析这些数据?

我的代码放在HTML下面;请忽略代码的底部部分,我已经编写/拆解过数十种不同的尝试,包括将text/html加载到XML中再进行处理。

"<!--mod v2-->
<!--ALL INQUIRY TYPE: BXS1PRA3-->
<!--AllInquiryType :: BXS1PRA3
ECBIn :: 
AllBin :: 1054664
AllCount :: 0001
AllBorough :: 
BoilerNumber :: 
OpenObjectionFlag :: 
BiswebReporting :: 
InternalFlag :: 
BoroughKey :: I
StCodeKey :: 
BldgLoKey :: 
JobSubmDate :: 
AllNumbHous :: 
AllStrt :: 
AllBoroughName :: 
AllIsn :: 
PassWorkOrderNumber :: 
PassJobNumber :: 
PassDocNumber :: 
SeqNumber :: 
PPremise60 :: 
PassRecordNumber :: 
RqidPriorToA :: 
RqidPriorToP :: 
RqidPriorToS :: 
RqidItemStatusAll :: 
RqidItemStatusOpen :: 
RqidMultiLineFirst :: 
RqidMultiLineAll :: 
RqidIndex :: 
NotUsed :: 
StartFloorSc52 :: 
JobTypeDesc :: 
PassDeviceNumber :: 
NextIndex :: 
EMPTYPARAMCANBEUSED :: 
AllLicNoCurrent :: 
AllLicTypeCurrent :: 
Empty1 :: 
AllCnNumber :: 
AllCdNumber :: 
ApplNumOcv5 :: 
PageNumber :: 
PfKey :: 
AllEmailAddrCurrent :: 
Empty2 :: 
StartActiveSelect :: 
AllControlNumber :: 
AllStartDate :: 
AllEndDate :: 
AllJobType :: 
AllCommBd :: 
AllViolationType :: 
AllIsn2 :: 
AllTblType :: 
AllBlock :: 
AllLot :: 
AllTblCode :: 
TblBusinessName :: 
AllJAppProfTitle :: 
AllJAppLicNumber :: 
AllMetrixId :: 
InPassword :: 
InUserId :: 
NavFlag :: 
STypeOcv3 :: 
PtTempStatus :: 
PtOtherAuthApproval :: 
PtOtherAuthSig :: 
FillerData :: 
PassTempJobNumber :: 
AllKey1 :: 
AllKey2 :: 
AllFilterLarge :: 
AllFileId :: 
AllMemoType :: 
AllNumOfDataLines :: 
ReadSw :: D
FinFlag :: 
VbLoginId :: 
SustainableFlag :: 
-->
<!--Fin :: 0
ErrorMsg :: 
MoreErrors :: 
MFErrorArray ::  ARRAY[2 * 120]
[1]
    [0:ErrorCode]{ }
    [1:ArrayIndex]{ }
MFErrorArray2 ::  ARRAY[3 * 60]
[1]
    [0:ErrorCode2]{ }
    [1:Substitution]{ }
    [2:ArrayIndex2]{ }
NotUsed :: 
AllControlNumber :: 07/30/1
Datu :: 8
Pgm :: BXS1PRA3
VlNumbHous :: 2421
NmStrt :: 2 AVENUE
NmBoro :: MANHATTAN
VlBin :: 1054664
VlNumbZip :: 10035
VlTaxBlock :: 01789
VlTaxLot :: 00024
VlCensTract :: 242
VlHlthArea :: 1700
HseLo :: 
HseHi :: 
GlJobType :: 
GlPageN :: 0001
GlRecCountN :: 0000000008
FoilIndicator :: 
GlMax :: 
DebugMsg :: 
BoroughName :: 
NumbHous :: 
Strt :: 
TransactionExecuted :: BXS1PRA3
Lines ::  ARRAY[22 * 40]
[1]
    [0:Pra3Isn]{0000564806}
    [1:Fd]{12062006}
    [2:Job]{104619478}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{001}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{12112006}
    [11:Applicant]{DEL MAST}
    [12:Rep]{}
    [13:Jobdes]{INSTALLATION OF A SCAFFOLD 16&#039; LONG X 38&#039; HIGH ON THE EXISTING SIDEWALK SH}
    [14:JAppLicNumberDisp]{OT}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[2]
    [0:Pra3Isn]{0000555722}
    [1:Fd]{09212006}
    [2:Job]{104550629}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{001,002,003,004,005}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{09212006}
    [11:Applicant]{Shapiro}
    [12:Rep]{}
    [13:Jobdes]{Filing herewith to make building structurally stable inconjunction with de}
    [14:JAppLicNumberDisp]{0060597 PE}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[3]
    [0:Pra3Isn]{0000520307}
    [1:Fd]{02092006}
    [2:Job]{104294096}
    [3:Ap]{01}
    [4:JobType]{DM}
    [5:Demo]{}
    [6:FlrInjq]{001}
    [7:Gas]{}
    [8:Js]{X}
    [9:Jobstatus]{SIGNED OFF}
    [10:Dt]{02022007}
    [11:Applicant]{JACOBSON}
    [12:Rep]{}
    [13:Jobdes]{}
    [14:JAppLicNumberDisp]{1788510 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[4]
    [0:Pra3Isn]{0000462054}
    [1:Fd]{07192004}
    [2:Job]{103835735}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{GRD}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{08192005}
    [11:Applicant]{SINGH}
    [12:Rep]{}
    [13:Jobdes]{CONSTRUCTION OF 65 LF HEAVY DUTY SIDEWAL K SHED WITH NOSTORAGE  AS PER P}
    [14:JAppLicNumberDisp]{}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[5]
    [0:Pra3Isn]{0000184027}
    [1:Fd]{06121997}
    [2:Job]{101534190}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{                         001 thru 005}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{07011997}
    [11:Applicant]{KO}
    [12:Rep]{}
    [13:Jobdes]{FIRE ESCAPES TO REPLACE EXISTING FIRE BALCONY.}
    [14:JAppLicNumberDisp]{0011493 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[6]
    [0:Pra3Isn]{0000010982}
    [1:Fd]{10041990}
    [2:Job]{100121823}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{1,  CLR}
    [7:Gas]{}
    [8:Js]{P}
    [9:Jobstatus]{APPROVED}
    [10:Dt]{10121990}
    [11:Applicant]{ESHKAR}
    [12:Rep]{}
    [13:Jobdes]{WORK AT NEW LAUNDROMAT ON FIRST FLOOR. N EW WASHERS, DRYERS, NEW HUNG}
    [14:JAppLicNumberDisp]{0018190 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[7]
    [0:Pra3Isn]{0000010981}
    [1:Fd]{10041990}
    [2:Job]{100121814}
    [3:Ap]{01}
    [4:JobType]{A2}
    [5:Demo]{}
    [6:FlrInjq]{1}
    [7:Gas]{}
    [8:Js]{J}
    [9:Jobstatus]{P/E DISAPPROVED}
    [10:Dt]{05091991}
    [11:Applicant]{ESHKAR}
    [12:Rep]{}
    [13:Jobdes]{INSTALL ANSUL SYSTEM AT RESTAURANT. INST ALL SPRINKLERS AT DRYERS AT}
    [14:JAppLicNumberDisp]{0018190 RA}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
[8]
    [0:Pra3Isn]{0000006469}
    [1:Fd]{07231990}
    [2:Job]{100079852}
    [3:Ap]{01}
    [4:JobType]{A3}
    [5:Demo]{}
    [6:FlrInjq]{GRD}
    [7:Gas]{}
    [8:Js]{R}
    [9:Jobstatus]{PERMIT-ENTIRE}
    [10:Dt]{05151991}
    [11:Applicant]{JONES}
    [12:Rep]{}
    [13:Jobdes]{ERECT 27 FEET OF SIDEWALK SHED FOR REPAI R OF FACADE. NO CHANGE IN USE}
    [14:JAppLicNumberDisp]{}
    [15:JAuditCodeFlag]{}
    [16:DiagramFlag]{N}
    [17:ZoningDiagramStatus]{N}
    [18:ZoningDiagramRecDate]{}
    [19:DocType]{IF}
    [20:FoundationAppDate]{}
    [21:Bin]{1054664}
-->


<html> 
<head>
    <title>Job Overview</title>
    <link rel=""stylesheet"" type=""text/css"" href=""bsqpm.css"" media=""screen"">
    <link rel=""stylesheet"" type""text/css"" href=""print.css"" media=""print"">
    <link rel=""shortcut icon"" href=""/favicon.ico"" type=""image/x-icon"" />
    <script language=""javascript"" src=""bis_lib.js""></script>
    <script language=""javascript"" src=""sorttable.js""></script>
    <script language=""javascript"">
    function $(eln)
    {
        return document.getElementById(eln);
    }
    </script>
        <script language=""javascript"">
    <!--
        function page(loc, ref)
        {
            //Commented out the usagelog creation for Caching on 4-30-15
            //var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
            //var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
            //(new Image(1,1)).src = u;
        }

        //onclick=""page('/path/place', document.location);""

                var _gaq = _gaq || [];
                _gaq.push(['_setAccount', 'UA-16591777-1']);
                _gaq.push(['_trackPageview']);

                (function() {
                        var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
                        ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
                        var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
                })();

    //-->
    </script>

</head>


<body bgcolor=""#ffffff"" leftmargin=""0"" topmargin=""0"" marginheight=""0"" marginwidth=""0"" bgproperties=""fixed"">
<center>
    <table border=""0"" cellpadding=""3"" cellspacing=""0"" width=""100%"">  <!--- Start of Header --->
        <tr>
            <td class=""header"" align=""center"" colspan=""3"" width=""100%"">
            <DIV class=""noprint"">
                <table border=""0"" cellpadding=""0"" cellspacing=""0"" width=""100%"">
                    <tr>
                        <td class=""header"">
                &nbsp;&nbsp;<a href='bsqpm01.jsp'>BIS Menu</a>&nbsp;&nbsp;|&nbsp;&nbsp;Applications
                        </td>
                        <td class=""header2"" align=""right"">&nbsp;&nbsp;<a href=""http://www1.nyc.gov/site/buildings/homeowner/homeowner-faqs.page"">FAQs</a>&nbsp;&nbsp;|&nbsp;&nbsp;<a href=""http://www1.nyc.gov/site/buildings/about/acronym-glossary.page"">Glossary</a>
                &nbsp;&nbsp;
                            <script language=""javascript"">
                            <!---
                                print_today();
                            //--->
                            </script>
                                <script language=""javascript"">
    <!--
        function page(loc, ref)
        {
            //Commented out the usagelog creation for Caching on 4-30-15
            //var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
            //var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
            //(new Image(1,1)).src = u;
        }

        //onclick=""page('/path/place', document.location);""

                var _gaq = _gaq || [];
                _gaq.push(['_setAccount', 'UA-16591777-1']);
                _gaq.push(['_trackPageview']);

                (function() {
                        var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
                        ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
                        var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
                })();

    //-->
    </script>

                        </td>
                    </tr>
                </table>
            </DIV>
            </td>
        </tr>
        <tr>
            <td colspan=2><a href=""http://www1.nyc.gov/site/buildings/index.page""><img src=""./images/doblogo_1.jpg"" alt=""DOB Logo - Link to Homepage"" border=""0""></a></td>  <!--- Cell for DOB Logo --->
            <td align=""right""><a href=""http://www1.nyc.gov/""><img src=""./images/nyclogo.gif"" alt=""NYC.gov Logo - Link to Homepage"" border=""0""></a>
                <br>
                <a href=""https://www.nyc.gov/portal/site/nycgov/menuitem.63099911d804683c09416f1076a09da0/"" onclick=""javascript:page('/ext/signupnews', document.location);"">
                    <img border=""0"" src=""images/clckhere.gif"" onmouseover=""this.src='images/clckhere_over.gif';"" onmouseout=""this.src='images/clckhere.gif';"">
                </a>
            </td>
        </tr>
        <tr>
            <td colspan=3 class=""nychdg"" align=""center""><b>NYC Department of Buildings</b></td>
        </tr>
        <tr>
            <td colspan=3 class=""mainhdg"" align=""center"">Job Overview</td>
        </tr>

    </table>        <!--- End of Header --->


<!---Start Message --->

代码:

' Downloads the BIS "Jobs by Location" page for a fixed BIN, writes the raw
' response text to cell A1 of the active sheet, and prints to the Immediate
' window: the response headers, the text of each top-level body node, and the
' text of every row of every <table> in the parsed document.
'
' Requires references: Microsoft XML, v6.0 and Microsoft HTML Object Library.
'
' Changes from the earlier draft of this routine:
'   * the request is synchronous, so no readyState polling is needed;
'   * a response containing "Updating" triggers a bounded number of re-requests
'     (the old loop waited on a response that could never change);
'   * the table loop iterates real table objects (it used an unset variable);
'   * the leftover Stop debugging statement was removed.
Sub getAndParse()
    Const MAX_ATTEMPTS As Long = 10

    Dim bin As String
    bin = "1054664"

    Dim URLOne As String
    URLOne = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=" & bin

    Dim xmlOne As MSXML2.XMLHTTP60
    Set xmlOne = New MSXML2.XMLHTTP60

    Dim htmlOne As Object
    Set htmlOne = CreateObject("htmlfile")

    Dim attempt As Long
    Dim stillUpdating As Boolean

    For attempt = 1 To MAX_ATTEMPTS
        ' Third argument False = synchronous: send returns only when the
        ' response is complete.
        xmlOne.Open "GET", URLOne, False
        xmlOne.setRequestHeader "Content-Type", "application/xml"
        xmlOne.send

        If xmlOne.Status <> 200 Then
            MsgBox "Connection Unable To Be Made, Try Again"
            Exit Sub
        End If

        ' NOTE(review): a repeated GET may be answered from the local cache;
        ' confirm the retry really reaches the server if "Updating" persists.
        stillUpdating = InStr(1, xmlOne.responseText, "Updating", vbBinaryCompare) > 0
        If Not stillUpdating Then Exit For

        Application.Wait Now + TimeValue("0:00:01")
    Next attempt

    If stillUpdating Then
        MsgBox "Connection Unable To Be Made, Try Again"
        Exit Sub
    End If

    htmlOne.body.innerHTML = xmlOne.responseText

    Debug.Print xmlOne.getAllResponseHeaders
    [A1] = xmlOne.responseText

    Dim nde As Object
    For Each nde In htmlOne.body.Children
        Debug.Print nde.innerText
    Next nde

    ' Every <table> is visited, including nested ones, so the text of a nested
    ' table is printed both on its own and as part of its parent's row.
    Dim tbl As MSHTML.HTMLTable
    Dim tblRow As MSHTML.HTMLTableRow

    For Each tbl In htmlOne.getElementsByTagName("table")
        For Each tblRow In tbl.Rows
            Debug.Print tblRow.innerText
        Next tblRow
    Next tbl
End Sub

1 个答案:

答案 0 :(得分:1)

这并不理想,但通过直接复制粘贴,可以避开中间那张表格的格式处理难题。我理解您可能想要一种利用“数组”信息的、更有条理的方法。

Option Explicit
' Fetches the BIS "Jobs by Location" page for BIN 1054664 and pastes one of its
' <table> elements (zero-based index 5) into the active sheet at A1 by placing
' the table's outerHTML on the clipboard and calling PasteSpecial.
'
' Requires references: Microsoft HTML Object Library and
' Microsoft Forms 2.0 Object Library.
'
' If the page holds fewer tables than expected, a message is shown and nothing
' is pasted. ScreenUpdating is restored on every exit path; unexpected errors
' are re-raised after the restore.
Public Sub GetInfo()
    Const URL As String = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=1054664"
    Const TABLE_INDEX As Long = 5

    Dim html As HTMLDocument, hTable As HTMLTable, clipboard As Object, xmlHttp As Object
    Dim tables As Object

    Set xmlHttp = CreateObject("MSXML2.XMLHTTP.6.0")
    Application.ScreenUpdating = False
    On Error GoTo Fail

    Set html = GetHTMLDoc(URL, xmlHttp)
    Set tables = html.getElementsByTagName("table")

    ' Guard the hard-coded index: the layout of the page is outside our control.
    If tables.Length <= TABLE_INDEX Then
        Application.ScreenUpdating = True
        MsgBox "Expected table " & TABLE_INDEX & " was not found in the response."
        Exit Sub
    End If

    Set hTable = tables(TABLE_INDEX)
    Set clipboard = New MSForms.DataObject
    clipboard.SetText hTable.outerHTML
    clipboard.PutInClipboard
    ActiveSheet.Cells(1, 1).PasteSpecial

    Application.ScreenUpdating = True
    Exit Sub

Fail:
    Application.ScreenUpdating = True
    Err.Raise Err.Number, Err.Source, Err.Description
End Sub

' Issues a synchronous GET for URL through the supplied request object and
' returns a new HTMLDocument whose body.innerHTML is set to the response text.
'
' URL     - address to request.
' xmlHttp - request object created by the caller (opened and sent here).
Public Function GetHTMLDoc(ByVal URL As String, ByRef xmlHttp As Object) As HTMLDocument
    Dim parsedDoc As HTMLDocument

    ' False = synchronous; send does not return until the response is complete.
    xmlHttp.Open "GET", URL, False
    xmlHttp.setRequestHeader "Content-Type", "text/xml"
    xmlHttp.send

    Set parsedDoc = New HTMLDocument
    parsedDoc.body.innerHTML = xmlHttp.responseText

    Set GetHTMLDoc = parsedDoc
End Function

引用(VBE > 工具 > 引用):

  1. Microsoft HTML Object Library
  2. Microsoft Forms 2.0 Object Library