我在解析抓取到的 HTML 时遇到了困难。
响应头是:Content-Type: text/html,Content-Length: 28078
http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=1054664
我尝试了以下方法:
获取 HTML 表格,并按行、节点或元素进行解析。这三种方式都勉强可行。但这份 HTML 的结构很别扭:有随机的空白,元素没有 ID,只有类名,解析起来非常痛苦。
我看到在 HTML 上方(即 <html> 标签之前的注释里)有这样一份表格数据,但我不确定它是哪种数据格式。我该如何获取并解析这些数据?
我的代码放在 HTML 下面。请忽略代码末尾的部分——我已经写过又拆掉了数十种不同的尝试,包括把 text/html 响应加载到 XML 中再进行处理。
"<!--mod v2-->
<!--ALL INQUIRY TYPE: BXS1PRA3-->
<!--AllInquiryType :: BXS1PRA3
ECBIn ::
AllBin :: 1054664
AllCount :: 0001
AllBorough ::
BoilerNumber ::
OpenObjectionFlag ::
BiswebReporting ::
InternalFlag ::
BoroughKey :: I
StCodeKey ::
BldgLoKey ::
JobSubmDate ::
AllNumbHous ::
AllStrt ::
AllBoroughName ::
AllIsn ::
PassWorkOrderNumber ::
PassJobNumber ::
PassDocNumber ::
SeqNumber ::
PPremise60 ::
PassRecordNumber ::
RqidPriorToA ::
RqidPriorToP ::
RqidPriorToS ::
RqidItemStatusAll ::
RqidItemStatusOpen ::
RqidMultiLineFirst ::
RqidMultiLineAll ::
RqidIndex ::
NotUsed ::
StartFloorSc52 ::
JobTypeDesc ::
PassDeviceNumber ::
NextIndex ::
EMPTYPARAMCANBEUSED ::
AllLicNoCurrent ::
AllLicTypeCurrent ::
Empty1 ::
AllCnNumber ::
AllCdNumber ::
ApplNumOcv5 ::
PageNumber ::
PfKey ::
AllEmailAddrCurrent ::
Empty2 ::
StartActiveSelect ::
AllControlNumber ::
AllStartDate ::
AllEndDate ::
AllJobType ::
AllCommBd ::
AllViolationType ::
AllIsn2 ::
AllTblType ::
AllBlock ::
AllLot ::
AllTblCode ::
TblBusinessName ::
AllJAppProfTitle ::
AllJAppLicNumber ::
AllMetrixId ::
InPassword ::
InUserId ::
NavFlag ::
STypeOcv3 ::
PtTempStatus ::
PtOtherAuthApproval ::
PtOtherAuthSig ::
FillerData ::
PassTempJobNumber ::
AllKey1 ::
AllKey2 ::
AllFilterLarge ::
AllFileId ::
AllMemoType ::
AllNumOfDataLines ::
ReadSw :: D
FinFlag ::
VbLoginId ::
SustainableFlag ::
-->
<!--Fin :: 0
ErrorMsg ::
MoreErrors ::
MFErrorArray :: ARRAY[2 * 120]
[1]
[0:ErrorCode]{ }
[1:ArrayIndex]{ }
MFErrorArray2 :: ARRAY[3 * 60]
[1]
[0:ErrorCode2]{ }
[1:Substitution]{ }
[2:ArrayIndex2]{ }
NotUsed ::
AllControlNumber :: 07/30/1
Datu :: 8
Pgm :: BXS1PRA3
VlNumbHous :: 2421
NmStrt :: 2 AVENUE
NmBoro :: MANHATTAN
VlBin :: 1054664
VlNumbZip :: 10035
VlTaxBlock :: 01789
VlTaxLot :: 00024
VlCensTract :: 242
VlHlthArea :: 1700
HseLo ::
HseHi ::
GlJobType ::
GlPageN :: 0001
GlRecCountN :: 0000000008
FoilIndicator ::
GlMax ::
DebugMsg ::
BoroughName ::
NumbHous ::
Strt ::
TransactionExecuted :: BXS1PRA3
Lines :: ARRAY[22 * 40]
[1]
[0:Pra3Isn]{0000564806}
[1:Fd]{12062006}
[2:Job]{104619478}
[3:Ap]{01}
[4:JobType]{A3}
[5:Demo]{}
[6:FlrInjq]{001}
[7:Gas]{}
[8:Js]{R}
[9:Jobstatus]{PERMIT-ENTIRE}
[10:Dt]{12112006}
[11:Applicant]{DEL MAST}
[12:Rep]{}
[13:Jobdes]{INSTALLATION OF A SCAFFOLD 16' LONG X 38' HIGH ON THE EXISTING SIDEWALK SH}
[14:JAppLicNumberDisp]{OT}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[2]
[0:Pra3Isn]{0000555722}
[1:Fd]{09212006}
[2:Job]{104550629}
[3:Ap]{01}
[4:JobType]{A2}
[5:Demo]{}
[6:FlrInjq]{001,002,003,004,005}
[7:Gas]{}
[8:Js]{R}
[9:Jobstatus]{PERMIT-ENTIRE}
[10:Dt]{09212006}
[11:Applicant]{Shapiro}
[12:Rep]{}
[13:Jobdes]{Filing herewith to make building structurally stable inconjunction with de}
[14:JAppLicNumberDisp]{0060597 PE}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[3]
[0:Pra3Isn]{0000520307}
[1:Fd]{02092006}
[2:Job]{104294096}
[3:Ap]{01}
[4:JobType]{DM}
[5:Demo]{}
[6:FlrInjq]{001}
[7:Gas]{}
[8:Js]{X}
[9:Jobstatus]{SIGNED OFF}
[10:Dt]{02022007}
[11:Applicant]{JACOBSON}
[12:Rep]{}
[13:Jobdes]{}
[14:JAppLicNumberDisp]{1788510 RA}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[4]
[0:Pra3Isn]{0000462054}
[1:Fd]{07192004}
[2:Job]{103835735}
[3:Ap]{01}
[4:JobType]{A3}
[5:Demo]{}
[6:FlrInjq]{GRD}
[7:Gas]{}
[8:Js]{R}
[9:Jobstatus]{PERMIT-ENTIRE}
[10:Dt]{08192005}
[11:Applicant]{SINGH}
[12:Rep]{}
[13:Jobdes]{CONSTRUCTION OF 65 LF HEAVY DUTY SIDEWAL K SHED WITH NOSTORAGE AS PER P}
[14:JAppLicNumberDisp]{}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[5]
[0:Pra3Isn]{0000184027}
[1:Fd]{06121997}
[2:Job]{101534190}
[3:Ap]{01}
[4:JobType]{A3}
[5:Demo]{}
[6:FlrInjq]{ 001 thru 005}
[7:Gas]{}
[8:Js]{R}
[9:Jobstatus]{PERMIT-ENTIRE}
[10:Dt]{07011997}
[11:Applicant]{KO}
[12:Rep]{}
[13:Jobdes]{FIRE ESCAPES TO REPLACE EXISTING FIRE BALCONY.}
[14:JAppLicNumberDisp]{0011493 RA}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[6]
[0:Pra3Isn]{0000010982}
[1:Fd]{10041990}
[2:Job]{100121823}
[3:Ap]{01}
[4:JobType]{A2}
[5:Demo]{}
[6:FlrInjq]{1, CLR}
[7:Gas]{}
[8:Js]{P}
[9:Jobstatus]{APPROVED}
[10:Dt]{10121990}
[11:Applicant]{ESHKAR}
[12:Rep]{}
[13:Jobdes]{WORK AT NEW LAUNDROMAT ON FIRST FLOOR. N EW WASHERS, DRYERS, NEW HUNG}
[14:JAppLicNumberDisp]{0018190 RA}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[7]
[0:Pra3Isn]{0000010981}
[1:Fd]{10041990}
[2:Job]{100121814}
[3:Ap]{01}
[4:JobType]{A2}
[5:Demo]{}
[6:FlrInjq]{1}
[7:Gas]{}
[8:Js]{J}
[9:Jobstatus]{P/E DISAPPROVED}
[10:Dt]{05091991}
[11:Applicant]{ESHKAR}
[12:Rep]{}
[13:Jobdes]{INSTALL ANSUL SYSTEM AT RESTAURANT. INST ALL SPRINKLERS AT DRYERS AT}
[14:JAppLicNumberDisp]{0018190 RA}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
[8]
[0:Pra3Isn]{0000006469}
[1:Fd]{07231990}
[2:Job]{100079852}
[3:Ap]{01}
[4:JobType]{A3}
[5:Demo]{}
[6:FlrInjq]{GRD}
[7:Gas]{}
[8:Js]{R}
[9:Jobstatus]{PERMIT-ENTIRE}
[10:Dt]{05151991}
[11:Applicant]{JONES}
[12:Rep]{}
[13:Jobdes]{ERECT 27 FEET OF SIDEWALK SHED FOR REPAI R OF FACADE. NO CHANGE IN USE}
[14:JAppLicNumberDisp]{}
[15:JAuditCodeFlag]{}
[16:DiagramFlag]{N}
[17:ZoningDiagramStatus]{N}
[18:ZoningDiagramRecDate]{}
[19:DocType]{IF}
[20:FoundationAppDate]{}
[21:Bin]{1054664}
-->
<html>
<head>
<title>Job Overview</title>
<link rel=""stylesheet"" type=""text/css"" href=""bsqpm.css"" media=""screen"">
<link rel=""stylesheet"" type""text/css"" href=""print.css"" media=""print"">
<link rel=""shortcut icon"" href=""/favicon.ico"" type=""image/x-icon"" />
<script language=""javascript"" src=""bis_lib.js""></script>
<script language=""javascript"" src=""sorttable.js""></script>
<script language=""javascript"">
function $(eln)
{
return document.getElementById(eln);
}
</script>
<script language=""javascript"">
<!--
function page(loc, ref)
{
//Commented out the usagelog creation for Caching on 4-30-15
//var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
//var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
//(new Image(1,1)).src = u;
}
//onclick=""page('/path/place', document.location);""
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-16591777-1']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
//-->
</script>
</head>
<body bgcolor=""#ffffff"" leftmargin=""0"" topmargin=""0"" marginheight=""0"" marginwidth=""0"" bgproperties=""fixed"">
<center>
<table border=""0"" cellpadding=""3"" cellspacing=""0"" width=""100%""> <!--- Start of Header --->
<tr>
<td class=""header"" align=""center"" colspan=""3"" width=""100%"">
<DIV class=""noprint"">
<table border=""0"" cellpadding=""0"" cellspacing=""0"" width=""100%"">
<tr>
<td class=""header"">
<a href='bsqpm01.jsp'>BIS Menu</a> | Applications
</td>
<td class=""header2"" align=""right""> <a href=""http://www1.nyc.gov/site/buildings/homeowner/homeowner-faqs.page"">FAQs</a> | <a href=""http://www1.nyc.gov/site/buildings/about/acronym-glossary.page"">Glossary</a>
<script language=""javascript"">
<!---
print_today();
//--->
</script>
<script language=""javascript"">
<!--
function page(loc, ref)
{
//Commented out the usagelog creation for Caching on 4-30-15
//var ce = (document.cookie == null || document.cookie == """") ? ""n"" : ""y"";
//var u = 'Log/img.gif?m=pg&url='+escape(loc)+'&ref='+escape(ref)+'&ra='+Math.round(Math.random()*100000)+'&ce='+ce;
//(new Image(1,1)).src = u;
}
//onclick=""page('/path/place', document.location);""
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-16591777-1']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
//-->
</script>
</td>
</tr>
</table>
</DIV>
</td>
</tr>
<tr>
<td colspan=2><a href=""http://www1.nyc.gov/site/buildings/index.page""><img src=""./images/doblogo_1.jpg"" alt=""DOB Logo - Link to Homepage"" border=""0""></a></td> <!--- Cell for DOB Logo --->
<td align=""right""><a href=""http://www1.nyc.gov/""><img src=""./images/nyclogo.gif"" alt=""NYC.gov Logo - Link to Homepage"" border=""0""></a>
<br>
<a href=""https://www.nyc.gov/portal/site/nycgov/menuitem.63099911d804683c09416f1076a09da0/"" onclick=""javascript:page('/ext/signupnews', document.location);"">
<img border=""0"" src=""images/clckhere.gif"" onmouseover=""this.src='images/clckhere_over.gif';"" onmouseout=""this.src='images/clckhere.gif';"">
</a>
</td>
</tr>
<tr>
<td colspan=3 class=""nychdg"" align=""center""><b>NYC Department of Buildings</b></td>
</tr>
<tr>
<td colspan=3 class=""mainhdg"" align=""center"">Job Overview</td>
</tr>
</table> <!--- End of Header --->
<!---Start Message --->
代码:
' Downloads the DOB "jobs by location" page for a fixed BIN, writes the raw
' response text into cell A1 of the active sheet, and prints the response
' headers, the body's top-level nodes and the rows of the results table to the
' Immediate window.
'
' Requires references to "Microsoft XML, v6.0" and "Microsoft HTML Object
' Library" (early-bound MSXML2.XMLHTTP60 / MSHTML.HTMLTableRow are used).
Sub getAndParse()
    ' Zero-based index of the table that holds the job rows. This is the same
    ' index the GetInfo routine in this file uses.
    ' NOTE(review): confirm against the live page layout.
    Const RESULT_TABLE_INDEX As Long = 5

    Dim bin As String
    bin = "1054664"

    Dim URLOne As String
    URLOne = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=" & bin

    Dim xmlOne As MSXML2.XMLHTTP60
    Set xmlOne = New MSXML2.XMLHTTP60

    Dim htmlOne As Object
    Set htmlOne = CreateObject("htmlfile")

    With xmlOne
        ' Synchronous request (third argument False): .send returns only once
        ' the response is complete, so no readyState polling is needed.
        .Open "GET", URLOne, False
        .setRequestHeader "Content-Type", "application/xml"
        .send

        If .Status <> 200 Then
            MsgBox "Connection Unable To Be Made, Try Again"
            Exit Sub
        End If

        ' The former "wait while the response contains 'Updating'" loop was
        ' removed: responseText never changes after the request completes, so
        ' that loop could only be skipped or spin forever.
        htmlOne.body.innerHTML = .responseText
    End With

    Debug.Print xmlOne.getAllResponseHeaders
    [A1] = xmlOne.responseText

    ' Dump the text of each top-level child of <body>.
    Dim nde As Object
    For Each nde In htmlOne.body.Children
        Debug.Print nde.innerText
    Next nde

    ' Locate the results table; indexing past the end of the collection
    ' yields Nothing rather than raising, so check before using it.
    Dim tbl As Object
    Set tbl = htmlOne.getElementsByTagName("table")(RESULT_TABLE_INDEX)
    If tbl Is Nothing Then
        Debug.Print "Results table (index " & RESULT_TABLE_INDEX & ") not found in response"
        Exit Sub
    End If

    Dim tblRow As MSHTML.HTMLTableRow
    For Each tblRow In tbl.Rows
        Debug.Print tblRow.innerText
    Next tblRow
End Sub
答案 0 :(得分:1)
这并不理想,但只需通过剪贴板复制粘贴,就可以避开处理中间那张表格格式的麻烦。我明白您可能更想要一种利用“数组”(ARRAY)信息的、更结构化的方法。
Option Explicit
' Fetches the DOB "jobs by location" page for a fixed BIN and pastes its
' results table into the active sheet starting at A1, going through the
' clipboard so Excel performs the HTML-table-to-cells conversion itself.
'
' Requires references to "Microsoft HTML Object Library" and "Microsoft Forms
' 2.0 Object Library" (for MSForms.DataObject).
'
' Raises: any error from the request/paste is re-raised to the caller after
' screen updating has been switched back on; a missing results table raises
' vbObjectError + 513 with a descriptive message.
Public Sub GetInfo()
    Const URL As String = "http://a810-bisweb.nyc.gov/bisweb/JobsQueryByLocationServlet?requestid=1&allbin=1054664"
    ' Zero-based index of the table holding the job rows.
    ' NOTE(review): depends on the page layout - confirm against the live page.
    Const RESULT_TABLE_INDEX As Long = 5

    Dim html As HTMLDocument, hTable As HTMLTable, clipboard As Object, xmlHttp As Object
    Dim errNum As Long, errSrc As String, errDesc As String

    Set xmlHttp = CreateObject("MSXML2.XMLHTTP.6.0")

    Application.ScreenUpdating = False
    On Error GoTo Fail

    Set html = GetHTMLDoc(URL, xmlHttp)

    ' Indexing past the end of the element collection yields Nothing, so
    ' report that explicitly instead of failing later with error 91.
    Set hTable = html.getElementsByTagName("table")(RESULT_TABLE_INDEX)
    If hTable Is Nothing Then
        Err.Raise vbObjectError + 513, "GetInfo", _
                  "Results table (index " & RESULT_TABLE_INDEX & ") not found in response"
    End If

    Set clipboard = New MSForms.DataObject
    clipboard.SetText hTable.outerHTML
    clipboard.PutInClipboard
    ActiveSheet.Cells(1, 1).PasteSpecial

    Application.ScreenUpdating = True
    Exit Sub

Fail:
    ' Capture the error first, restore the UI state, then propagate.
    errNum = Err.Number
    errSrc = Err.Source
    errDesc = Err.Description
    Application.ScreenUpdating = True
    Err.Raise errNum, errSrc, errDesc
End Sub
' Issues a synchronous GET for URL through the supplied XMLHTTP object and
' returns a new HTMLDocument whose body has been populated with the response
' text.
'
' URL     - address to request.
' xmlHttp - caller-supplied XMLHTTP object used to perform the request.
' Returns - the HTMLDocument built from the response text.
Public Function GetHTMLDoc(ByVal URL As String, ByRef xmlHttp As Object) As HTMLDocument
    Dim parsedDoc As HTMLDocument

    ' Third argument False = blocking call; .send returns when the response is in.
    xmlHttp.Open "GET", URL, False
    xmlHttp.setRequestHeader "Content-Type", "text/xml"
    xmlHttp.send

    Set parsedDoc = New HTMLDocument
    parsedDoc.body.innerHTML = xmlHttp.responseText

    Set GetHTMLDoc = parsedDoc
End Function
参考: