第三方XML解析器(xpath.js)报错:"Uncaught end tag name: div is not match the current start tagName:script"(结束标签 div 与当前的开始标签 script 不匹配)

时间:2014-09-27 11:59:32

标签: javascript xml node.js xpath parse-platform

我正在使用 parse.com 的云代码(Cloud Code),尝试从网页上抓取数据并发送到我的 iOS 应用程序。我已经在 iOS 端本地实现了网页抓取代码,但现在想把这项任务移到后端。我使用的是一个名为 xpath.js 的 node.js 库:
/**
 * Cloud function "test": downloads the dining menu page, parses it and
 * reports how many menu grid cells were found.
 *
 * Responds with "test <count>" on success and with an error message when
 * either the HTTP request or the parsing step fails.
 */
Parse.Cloud.define("test", function(request, response) {
    Parse.Cloud.httpRequest({
        url: "http://menu.ha.ucla.edu/foodpro/default.asp",
        success: function(httpResponse) {
            var xpath = require("cloud/xpath.js");
            var DOMParser = require("cloud/dom-parser.js").DOMParser;
            var cells;

            try {
                var doc = new DOMParser().parseFromString(httpResponse.text);
                cells = xpath.select("//td[starts-with(@class, 'menugridcell')]", doc);
            } catch (e) {
                // The parser may throw a plain string rather than an Error,
                // so fall back to the thrown value itself.
                response.error("Failed to parse menu page: " + (e && e.message ? e.message : e));
                return;
            }

            // The result is used as an array, so its size is `.length`;
            // `.count` does not exist and always produced "test undefined".
            response.success("test " + cells.length);
        },
        error: function(httpResponse) {
            var message = 'Request failed with response code ' + httpResponse.status;
            console.error(message);
            // Answer the caller as well; logging alone leaves the request
            // without any response.
            response.error(message);
        }
    });
});

但是,当我运行代码时,我收到此错误:

"Uncaught end tag name: div is not match the current start tagName:script"

就像我之前提到的,我已经能够使用单独的objective-c库成功地抓取Web数据,因此标记是一致的,并且问题不在于源代码中。

至于页面源代码,这是我正在抓取的网页:http://menu.ha.ucla.edu/foodpro/default.asp 。StackOverflow 不允许我直接链接到页面的源代码视图,否则我会直接给出链接。

编辑:

这是dom-parser.js中的代码

/**
 * DOM parser facade: feeds markup through a SAX reader into a DOM builder.
 *
 * @param {Object} [options] Settings read by parseFromString: `domBuilder`,
 *   `errorHandler`, `locator` and `xmlns`. When omitted, an options object
 *   holding only an empty `locator` is used.
 */
function DOMParser(options){
    this.options = options ||{locator:{}};
}

/**
 * Parses `source` and returns the document held by the DOM builder.
 *
 * @param {string} source Markup to parse. A falsy value is not parsed; it is
 *   reported through the error handler's `error` callback instead.
 * @param {string} [mimeType] When it matches /\/x?html?$/ (e.g. "text/html")
 *   the `nbsp` and `copy` entities are added and the default namespace is set
 *   to the XHTML namespace.
 * @returns {*} The `document` property of the DOM builder.
 */
DOMParser.prototype.parseFromString = function(source,mimeType){
    var options = this.options;
    var sax = new XMLReader();
    var domBuilder = options.domBuilder || new DOMHandler();//contentHandler and LexicalHandler
    var errorHandler = options.errorHandler;
    var locator = options.locator;
    var entityMap = {'lt':'<','gt':'>','amp':'&','quot':'"','apos':"'"};

    // Work on a private copy: the HTML branch below adds a default namespace,
    // and that must not leak into the caller's `options.xmlns` object (which
    // would also affect every later parse done with this parser).
    var configuredNSMap = options.xmlns || {};
    var defaultNSMap = {};
    for (var prefix in configuredNSMap) {
        defaultNSMap[prefix] = configuredNSMap[prefix];
    }

    if(locator){
        domBuilder.setDocumentLocator(locator);
    }

    sax.errorHandler = buildErrorHandler(errorHandler,domBuilder,locator);
    sax.domBuilder = domBuilder;
    if(/\/x?html?$/.test(mimeType)){
        entityMap.nbsp = '\xa0';
        entityMap.copy = '\xa9';
        defaultNSMap['']= 'http://www.w3.org/1999/xhtml';
    }
    if(source){
        sax.parse(source,defaultNSMap,entityMap);
    }else{
        sax.errorHandler.error("invalid document source");
    }
    return domBuilder.document;
}
/**
 * Builds the error handler object handed to the SAX reader.
 *
 * - Without `errorImpl`, a `domBuilder` that is a DOMHandler is returned as
 *   is; any other `domBuilder` takes the place of `errorImpl`.
 * - Otherwise an object with `warning`, `error` and `fatalError` functions is
 *   returned. Each one appends the locator text to the message and forwards
 *   it to the matching implementation, or does nothing when none is found.
 *
 * @param {Object|Function} errorImpl Object with per-level methods, or a
 *   single callback: `(level, msg)` when declared with two parameters,
 *   otherwise `(msg)`.
 * @param {Object} domBuilder Fallback source of handlers.
 * @param {Object} [locator] Position source passed to `_locator`.
 * @returns {Object} The error handler.
 */
function buildErrorHandler(errorImpl,domBuilder,locator){
    if(!errorImpl){
        if(domBuilder instanceof DOMHandler){
            return domBuilder;
        }
        errorImpl = domBuilder;
    }

    var handlers = {};
    var implIsFunction = errorImpl instanceof Function;
    locator = locator || {};

    // Finds the function that should receive messages for `level`; returns
    // a falsy value when nothing suitable exists.
    function lookup(level, fallbacks){
        var direct = errorImpl[level];
        if(direct){
            return direct;
        }
        if(implIsFunction){
            if(errorImpl.length == 2){
                return function(msg){errorImpl(level,msg)};
            }
            return errorImpl;
        }
        // Fallback names are tried from the last one to the first.
        for(var idx = fallbacks.length - 1; idx >= 0; idx--){
            var candidate = errorImpl[fallbacks[idx]];
            if(candidate){
                return candidate;
            }
        }
        return undefined;
    }

    function install(level){
        var fallbacks = Array.prototype.slice.call(arguments, 1);
        var target = lookup(level, fallbacks);
        if(target){
            handlers[level] = function(msg){
                target(msg+_locator(locator));
            };
        }else{
            handlers[level] = function(){};
        }
    }

    install('warning','warn');
    install('error','warn','warning');
    install('fatalError','warn','warning','error');
    return handlers;
}
/**
 * SAX-style handler that assembles a DOM document from parser events.
 *
 * +ContentHandler+ErrorHandler
 * +LexicalHandler+EntityResolver2
 * -DeclHandler-DTDHandler
 *
 * DefaultHandler:EntityResolver, DTDHandler, ContentHandler, ErrorHandler
 * DefaultHandler2:DefaultHandler,LexicalHandler, DeclHandler, EntityResolver2
 * @link http://www.saxproject.org/apidoc/org/xml/sax/helpers/DefaultHandler.html
 */
function DOMHandler() {
    // True between startCDATA() and endCDATA(); read by characters().
    this.cdata = false;
}

/**
 * Copies the locator's current line and column numbers onto `node`.
 */
function position(locator,node){
    node.lineNumber = locator.lineNumber;
    node.columnNumber = locator.columnNumber;
}

/**
 * Event callbacks. State kept on the instance: `document` (set by
 * startDocument), `currentElement` (the element new nodes are appended to),
 * `locator` (optional position source) and `cdata`.
 *
 * @see org.xml.sax.ContentHandler#startDocument
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ContentHandler.html
 */
DOMHandler.prototype = {
    /** Creates the empty document; copies the locator's systemId when a locator is set. */
    startDocument : function() {
        this.document = new DOMImplementation().createDocument(null, null, null);
        if (this.locator) {
            this.document.documentURI = this.locator.systemId;
        }
    },

    /**
     * Creates an element (named by qName, or localName when qName is empty),
     * appends it at the current position, makes it the current element and
     * copies every entry of `attrs` onto it as an attribute node.
     */
    startElement:function(namespaceURI, localName, qName, attrs) {
        var doc = this.document;
        var el = doc.createElementNS(namespaceURI, qName||localName);
        var len = attrs.length;
        appendElement(this, el);
        this.currentElement = el;

        this.locator && position(this.locator,el);
        for (var i = 0 ; i < len; i++) {
            // Separate locals: the element's own namespaceURI/qName
            // parameters are no longer overwritten inside the loop.
            var attrNamespaceURI = attrs.getURI(i);
            var attrValue = attrs.getValue(i);
            var attrQName = attrs.getQName(i);
            var attr = doc.createAttributeNS(attrNamespaceURI, attrQName);
            // NOTE(review): this tests the freshly created DOM attribute, not
            // the SAX `attrs` collection, so it only runs if the DOM
            // implementation itself provides getOffset - confirm against
            // sax.js/dom.js before changing it.
            if( attr.getOffset){
                position(attr.getOffset(1),attr);
            }
            attr.value = attr.nodeValue = attrValue;
            el.setAttributeNode(attr);
        }
    },

    /** Moves the current position back to the parent of the current element. */
    endElement:function(namespaceURI, localName, qName) {
        var current = this.currentElement;
        this.currentElement = current.parentNode;
    },
    startPrefixMapping:function(prefix, uri) {
    },
    endPrefixMapping:function(prefix) {
    },

    /** Appends a processing instruction node at the current position. */
    processingInstruction:function(target, data) {
        var ins = this.document.createProcessingInstruction(target, data);
        this.locator && position(this.locator,ins);
        appendElement(this, ins);
    },
    ignorableWhitespace:function(ch, start, length) {
    },

    /**
     * Appends character data to the current element: a CDATA section while
     * the `cdata` flag is set, a text node otherwise. Does nothing when there
     * is no current element or the extracted text is empty.
     */
    characters:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        if(!this.currentElement || !text){
            return;
        }
        var charNode = this.cdata
            ? this.document.createCDATASection(text)
            : this.document.createTextNode(text);
        this.currentElement.appendChild(charNode);
        this.locator && position(this.locator,charNode);
    },
    skippedEntity:function(name) {
    },
    endDocument:function() {
        this.document.normalize();
    },

    /** Stores the locator and, when one is given, resets its line number to 0. */
    setDocumentLocator:function (locator) {
        this.locator = locator;
        if(locator){
            locator.lineNumber = 0;
        }
    },

    //LexicalHandler
    /** Appends a comment node at the current position. */
    comment:function(chars, start, length) {
        var text = _toString.apply(this,arguments);
        var comm = this.document.createComment(text);
        this.locator && position(this.locator,comm);
        appendElement(this, comm);
    },

    startCDATA:function() {
        //used in characters() methods
        this.cdata = true;
    },
    endCDATA:function() {
        this.cdata = false;
    },

    /** Appends a doctype node when the document's implementation can create one. */
    startDTD:function(name, publicId, systemId) {
        var impl = this.document.implementation;
        if (impl && impl.createDocumentType) {
            var dt = impl.createDocumentType(name, publicId, systemId);
            this.locator && position(this.locator,dt);
            appendElement(this, dt);
        }
    },

    /**
     * @see org.xml.sax.ErrorHandler
     * @link http://www.saxproject.org/apidoc/org/xml/sax/ErrorHandler.html
     */
    warning:function(error) {
        console.warn(error,_locator(this.locator));
    },
    error:function(error) {
        console.error(error,_locator(this.locator));
    },
    /** Logs the error and then throws the received value unchanged. */
    fatalError:function(error) {
        console.error(error,_locator(this.locator));
        throw error;
    }
}
/**
 * Formats a locator as a "\n@<systemId>#[line:<n>,col:<n>]" suffix for
 * diagnostic messages. Returns undefined when no locator is given.
 */
function _locator(l){
    if(!l){
        return;
    }
    var source = l.systemId || '';
    var place = '[line:' + l.lineNumber + ',col:' + l.columnNumber + ']';
    return '\n@' + source + '#' + place;
}
/**
 * Extracts `length` characters starting at `start` from `chars`.
 *
 * JavaScript strings are cut with substr. Any other value is either
 * converted through java.lang.String (when it is long enough for the
 * requested range, or `start` is non-zero) or returned unchanged.
 */
function _toString(chars,start,length){
    if(typeof chars != 'string'){
        // Non-string input: per the original note this path connects a Java
        // SAX parser to xmldom under Rhino (open question left there:
        // "? && !(chars instanceof String)").
        var needsConversion = chars.length >= start+length || start;
        if(!needsConversion){
            return chars;
        }
        return new java.lang.String(chars,start,length)+'';
    }
    return chars.substr(start,length);
}

/*
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/LexicalHandler.html
 * used method of org.xml.sax.ext.LexicalHandler:
 *  #comment(chars, start, length)
 *  #startCDATA()
 *  #endCDATA()
 *  #startDTD(name, publicId, systemId)
 *
 *
 * IGNORED method of org.xml.sax.ext.LexicalHandler:
 *  #endDTD()
 *  #startEntity(name)
 *  #endEntity(name)
 *
 *
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/DeclHandler.html
 * IGNORED method of org.xml.sax.ext.DeclHandler
 *  #attributeDecl(eName, aName, type, mode, value)
 *  #elementDecl(name, model)
 *  #externalEntityDecl(name, publicId, systemId)
 *  #internalEntityDecl(name, value)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/ext/EntityResolver2.html
 * IGNORED method of org.xml.sax.EntityResolver2
 *  #resolveEntity(String name,String publicId,String baseURI,String systemId)
 *  #resolveEntity(publicId, systemId)
 *  #getExternalSubset(name, baseURI)
 * @link http://www.saxproject.org/apidoc/org/xml/sax/DTDHandler.html
 * IGNORED method of org.xml.sax.DTDHandler
 *  #notationDecl(name, publicId, systemId) {};
 *  #unparsedEntityDecl(name, publicId, systemId, notationName) {};
 */
// Give DOMHandler a stub returning null for every SAX callback that the
// comment above lists as IGNORED, so a reader may call them safely.
"endDTD,startEntity,endEntity,attributeDecl,elementDecl,externalEntityDecl,internalEntityDecl,resolveEntity,getExternalSubset,notationDecl,unparsedEntityDecl".split(',').forEach(function(name){
    DOMHandler.prototype[name] = function(){
        return null;
    };
});

/* Private static helper used like a private instance method, so it is not part of the public API; a Relator could also be used to get rid of the non-standard public properties. */
/**
 * Appends `node` to the handler's current element, or to the document when
 * no element is currently open.
 */
function appendElement (hander,node) {
    var parent = hander.currentElement ? hander.currentElement : hander.document;
    parent.appendChild(node);
}//appendChild and setAttributeNS are performance key

// Load dependencies and publish the API only when a CommonJS-style `require`
// function exists.
if(typeof require == 'function'){
    // Declared with `var`, so both names are hoisted out of this `if` block
    // and are visible to the functions defined earlier in the file.
    var XMLReader = require('cloud/sax').XMLReader;
    var DOMImplementation = exports.DOMImplementation = require('cloud/dom').DOMImplementation;
    exports.XMLSerializer = require('cloud/dom').XMLSerializer ;
    exports.DOMParser = DOMParser;
}

1 个答案:

答案 0 :(得分:0)

该页面的 <script> 脚本块中包含一些以字符串形式出现的标记。其中的开始标签被解析器忽略了,因为它们含有转义的引号(escaped quotation marks)。解析器随后遇到 </div>(位于脚本的字符串中),试图将它与尚未闭合的 <script> 开始标签匹配,于是失败。解析器是把页面当作 XML 来读取的,并不知道 (X)HTML 的脚本区域应当按 CDATA 处理。

你必须让解析器忽略 <script> 标签中的内容(或者把它当作 CDATA 读取)。抱歉,我不知道具体该怎么做。

最好的问候Majo