使用parse.com的云代码,我正试图从网页上抓取数据以发送到我的iOS应用程序。我已经在iOS中本地实现了Web抓取代码,但我正在尝试将此任务移至后端。我正在使用名为xpath.js
的node.js库Parse.Cloud.define("test", function(request, response) {
Parse.Cloud.httpRequest({
url: "http://menu.ha.ucla.edu/foodpro/default.asp",
success: function(httpResponse) {
var text = httpResponse.text;
var xpath = require("cloud/xpath.js"), dom = require("cloud/dom-parser.js").DOMParser;
var doc = new dom().parseFromString(text);
var cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);
response.success("test " + cells.count);
var listNode = xpath.select("//ul", cells[0])[0];
},
error: function(httpResponse) {
console.error('Request failed with response code ' + httpResponse.status);
}
});
});
但是,当我运行代码时,我收到此错误:
"Uncaught end tag name: div is not match the current start tagName:script"
就像我之前提到的,我已经能够使用单独的objective-c库成功地抓取Web数据,因此标记是一致的,并且问题不在于源代码中。
对于源代码,这里是webpage I'm scraping。 StackOverflow不允许我直接链接到源代码,否则我会直接链接。
编辑:
这是dom-parser.js中的代码
function DOMParser(options){
this.options = options ||{locator:{}};
}
DOMParser.prototype.parseFromString = function(source,mimeType){
var options = this.options;
var sax = new XMLReader();
var domBuilder = options.domBuilder || new DOMHandler();//contentHandler and LexicalHandler
var errorHandler = options.errorHandler;
var locator = options.locator;
var defaultNSMap = options.xmlns||{};
var entityMap = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"}
if(locator){
domBuilder.setDocumentLocator(locator)
}
sax.errorHandler = buildErrorHandler(errorHandler,domBuilder,locator);
sax.domBuilder = options.domBuilder || domBuilder;
if(/\/x?html?$/.test(mimeType)){
entityMap.nbsp = '\xa0';
entityMap.copy = '\xa9';
defaultNSMap['']= 'http://www.w3.org/1999/xhtml';
}
if(source){
sax.parse(source,defaultNSMap,entityMap);
}else{
sax.errorHandler.error("invalid document source");
}
return domBuilder.document;
}
function buildErrorHandler(errorImpl,domBuilder,locator){
if(!errorImpl){
if(domBuilder instanceof DOMHandler){
return domBuilder;
}
errorImpl = domBuilder ;
}
var errorHandler = {}
var isCallback = errorImpl instanceof Function;
locator = locator||{}
function build(key){
var fn = errorImpl[key];
if(!fn){
if(isCallback){
fn = errorImpl.length == 2?function(msg){errorImpl(key,msg)}:errorImpl;
}else{
var i=arguments.length;
while(--i){
if(fn = errorImpl[arguments[i]]){
break;
}
}
}
}
errorHandler[key] = fn && function(msg){
fn(msg+_locator(locator));
}||function(){};
}
build('warning','warn');
build('error','warn','warning');
build('fatalError','warn','warning','error');
return errorHandler;
}
/**
* +ContentHandler+ErrorHandler
* +LexicalHandler+EntityResolver2
* -DeclHandler-DTDHandler
*
* DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
* DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
* @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
*/
function DOMHandler() {
this.cdata = false;
}
function position(locator,node){
node.lineNumber = locator.lineNumber;
node.columnNumber = locator.columnNumber;
}
/**
* @see org.xml.sax.ContentHandler#startDocument
* @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
*/
DOMHandler.prototype = {
startDocument : function() {
this.document = new DOMImplementation().createDocument(null, null, null);
if (this.locator) {
this.document.documentURI = this.locator.systemId;
}
},
startElement:function(namespaceURI, localName, qName, attrs) {
var doc = this.document;
var el = doc.createElementNS(namespaceURI, qName||localName);
var len = attrs.length;
appendElement(this, el);
this.currentElement = el;
this.locator && position(this.locator,el)
for (var i = 0 ; i < len; i++) {
var namespaceURI = attrs.getURI(i);
var value = attrs.getValue(i);
var qName = attrs.getQName(i);
var attr = doc.createAttributeNS(namespaceURI, qName);
if( attr.getOffset){
position(attr.getOffset(1),attr)
}
attr.value = attr.nodeValue = value;
el.setAttributeNode(attr)
}
},
endElement:function(namespaceURI, localName, qName) {
var current = this.currentElement
var tagName = current.tagName;
this.currentElement = current.parentNode;
},
startPrefixMapping:function(prefix, uri) {
},
endPrefixMapping:function(prefix) {
},
processingInstruction:function(target, data) {
var ins = this.document.createProcessingInstruction(target, data);
this.locator && position(this.locator,ins)
appendElement(this, ins);
},
ignorableWhitespace:function(ch, start, length) {
},
characters:function(chars, start, length) {
chars = _toString.apply(this,arguments)
//console.log(chars)
if(this.currentElement && chars){
if (this.cdata) {
var charNode = this.document.createCDATASection(chars);
this.currentElement.appendChild(charNode);
} else {
var charNode = this.document.createTextNode(chars);
this.currentElement.appendChild(charNode);
}
this.locator && position(this.locator,charNode)
}
},
skippedEntity:function(name) {
},
endDocument:function() {
this.document.normalize();
},
setDocumentLocator:function (locator) {
if(this.locator = locator){// && !('lineNumber' in locator)){
locator.lineNumber = 0;
}
},
//LexicalHandler
comment:function(chars, start, length) {
chars = _toString.apply(this,arguments)
var comm = this.document.createComment(chars);
this.locator && position(this.locator,comm)
appendElement(this, comm);
},
startCDATA:function() {
//used in characters() methods
this.cdata = true;
},
endCDATA:function() {
this.cdata = false;
},
startDTD:function(name, publicId, systemId) {
var impl = this.document.implementation;
if (impl && impl.createDocumentType) {
var dt = impl.createDocumentType(name, publicId, systemId);
this.locator && position(this.locator,dt)
appendElement(this, dt);
}
},
/**
* @see org.xml.sax.ErrorHandler
* @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
*/
warning:function(error) {
console.warn(error,_locator(this.locator));
},
error:function(error) {
console.error(error,_locator(this.locator));
},
fatalError:function(error) {
console.error(error,_locator(this.locator));
throw error;
}
}
function _locator(l){
if(l){
return '\n@'+(l.systemId ||'')+'#[line:'+l.lineNumber+',col:'+l.columnNumber+']'
}
}
function _toString(chars,start,length){
if(typeof chars == 'string'){
return chars.substr(start,length)
}else{//java sax connect width xmldom on rhino(what about: "? && !(chars instanceof String)")
if(chars.length >= start+length || start){
return new java.lang.String(chars,start,length)+'';
}
return chars;
}
}
/*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
* used method of org.xml.sax.ext.LexicalHandler:
* #comment(chars, start, length)
* #startCDATA()
* #endCDATA()
* #startDTD(name, publicId, systemId)
*
*
* IGNORED method of org.xml.sax.ext.LexicalHandler:
* #endDTD()
* #startEntity(name)
* #endEntity(name)
*
*
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
* IGNORED method of org.xml.sax.ext.DeclHandler
* #attributeDecl(eName, aName, type, mode, value)
* #elementDecl(name, model)
* #externalEntityDecl(name, publicId, systemId)
* #internalEntityDecl(name, value)
* @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
* IGNORED method of org.xml.sax.EntityResolver2
* #resolveEntity(String name,String publicId,String baseURI,String systemId)
* #resolveEntity(publicId, systemId)
* #getExternalSubset(name, baseURI)
* @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
* IGNORED method of org.xml.sax.DTDHandler
* #notationDecl(name, publicId, systemId) {};
* #unparsedEntityDecl(name, publicId, systemId, notationName) {};
*/
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".replace(/\w+/g,function(key){
DOMHandler.prototype[key] = function(){return null}
})
/* Private static helpers treated below as private instance methods, so don't need to add these to the public API; we might use a Relator to also get rid of non-standard public properties */
function appendElement (hander,node) {
if (!hander.currentElement) {
hander.document.appendChild(node);
} else {
hander.currentElement.appendChild(node);
}
}//appendChild and setAttributeNS are preformance key
if(typeof require == 'function'){
var XMLReader = require('cloud/sax').XMLReader;
var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation;
exports.XMLSerializer = require('cloud/dom').XMLSerializer ;
exports.DOMParser = DOMParser;
}
答案 0 :(得分:0)
给定页面在html脚本中包含一些XML标记。可以忽略开始标记,因为它们包含excaped qoutation标记。解析器找到</div>
(在脚本中的字符串中)并尝试将其与开头<script>
匹配并失败。解析器尝试读取XML并且不知道xhtml脚本区域是CData。
你必须告诉解析忽略(或读作CData)脚本标记。抱歉,但我不知道该怎么做。
最好的问候Majo