NodeJS parseStream,定义块的起点和终点

时间:2013-12-16 03:42:37

标签: javascript xml node.js node.js-stream

我对Node的文件系统流式解析感到困惑。这是我的代码:

var fs = require('fs'),
    xml2js = require('xml2js');

// Parser that is handed each chunk as the stream delivers it.
var parser = new xml2js.Parser();

// Read the XML file as a stream of UTF-8 strings.
var stream = fs.createReadStream('xml/bigXML.xml');
stream.setEncoding('utf8');

// The 'data' handler receives one chunk of the file at a time.
stream.on('data', function(chunk){ 

    // Each chunk is parsed on its own rather than as part of the whole
    // document; `err` is not inspected, so a parse failure is not reported.
    parser.parseString(chunk, function (err, result) {
        console.dir(result);
        console.log('Done');
    });
});


// The 'end' handler runs once no more data will be delivered.
stream.on('end', function(chunk){
    // The file has been read completely; do something...
    console.log("IT'S OVER")
});

结果是……什么都没有发生。XML2JS/解析器根本没有任何输出。当我尝试console.log(chunk)时,块的划分似乎只取决于字节大小,而不是任何有意义的边界。其中一个"块"的输出是:

<?xml version="1.0" encoding="UTF-8"?>
    <merchandiser xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="merchandiser.xsd">
    <header><merchantId>1237</merchantId><merchantName>NORDSTROM.com</merchantName><createdOn>12/13/2013 23:50:57</createdOn></header>
    <product product_id="52863929">// product info</product>
    <product product_id="26537849">// product info</product>
    <product product_id="25535647">// product info</product>

这个chunk里面有很多很多来自XML的<product>条目。块会在某个<product>条目的中间结束,下一个块则从上一个块停止的地方继续。

主要问题是:如何让createReadStream输出从<product开始、到</product>结束的块?

编辑:为了便于得到正确的输出,下面是从文件开头到第一个<product>结束的XML:

<?xml version="1.0" encoding="UTF-8" ?>
<merchandiser xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="merchandiser.xsd">
  <header>
    <merchantId>1237</merchantId>
    <merchantName>NORDSTROM.com</merchantName>
    <createdOn>12/13/2013 23:50:57</createdOn>
  </header>
  <product product_id="52863929" name="Teva 'Psyclone' Print Sandal (Baby, Walker &amp; Toddler) Camo/ Dark Olive 6 M" sku_number="52863929" manufacturer_name="Teva" part_number="1001701">
    <category>
      <primary>Toddler Unisex</primary>
      <secondary>Shoes~~Sandals/Slides</secondary>
    </category>
    <URL>
      <product>http://click.linksynergy.com/link?id=LUyP0GcLCGc&amp;offerid=276223.52863929&amp;type=15&amp;murl=http%3A%2F%2Fshop.nordstrom.com%2FS%2F3297406%3Fcm_cat%3Ddatafeed%26cm_pla%3Dshoes%3Asandals%252fslides%26cm_ite%3Dteva_%2527psyclone%2527_print_sandal_%2528baby%252c_walker_%2526_toddler%2529%3A503158_1%26cm_ven%3DLinkshare</product>
      <productImage>http://content.nordstrom.com/imagegallery/store/product/large/0/_6880020.jpg</productImage>
      <buy></buy>
    </URL>
    <description>
      <short>Rugged construction and stylish good looks define a sporty sandal, with the added convenience and security of hook-and-loop closures across the toe and at the instep.Rugged construction and stylish good looks define a sporty sandal, with the added
        convenience and security of h...</short>
      <long>Rugged construction and stylish good looks define a sporty sandal, with the added convenience and security of hook-and-loop closures across the toe and at the instep.Rugged construction and stylish good looks define a sporty sandal, with the added
        convenience and security of hook-and-loop closures across the toe and at the instep. Color(s): camo/ dark olive, daisy blue. Brand: Teva. Style Name: Teva 'Psyclone' Print Sandal (Baby, Walker &amp; Toddler). Style Number: 503158_1.</long>
    </description>
    <discount currency="USD">
      <amount></amount>
      <type>amount</type>
    </discount>
    <price currency="USD">
      <sale begin_date="" end_date="">24.95</sale>
      <retail>24.95</retail>
    </price>
    <brand>Teva</brand>
    <shipping>
      <cost currency="USD">
        <amount>0.00</amount>
        <currency>USD</currency>
      </cost>
      <information></information>
      <availability>Y</availability>
    </shipping>
    <keywords></keywords>
    <upc>737872649135</upc>
    <m1>503158_1.</m1>
    <pixel>http://ad.linksynergy.com/fs-bin/show?id=LUyP0GcLCGc&amp;bids=276223.52863929&amp;type=15&amp;subid=0</pixel>
    <attributeClass class_id="60">
      <Misc></Misc>
      <Product_Type>Shoes</Product_Type>
      <Size>6 M</Size>
      <Material></Material>
      <Color>CAMO/ DARK OLIVE</Color>
      <Gender>Unisex</Gender>
      <Style></Style>
      <Age></Age>
    </attributeClass>
  </product>

2 个答案:

答案 0 :(得分:8)

您有两种方法可以解决您的问题。

如wethat所述,XML2JS在解析数据之前需要完整的XML内容。但是你拿到的是一个文件流,它是一块一块地传输数据的。第一个解决方案是把这个数据流转换成一个大的缓冲区,然后将其交给XML2JS。为此,您可以使用stream-to包(npm i stream-to)将文件流转换为缓冲区数组,然后使用Buffer.concat将它们连接成一个缓冲区,如下所示:

var fs = require('fs')
var streamTo = require('stream-to')
var xml2js = require('xml2js')

// Buffer the whole file first, then hand the complete document to xml2js.
var file = fs.createReadStream('input.xml')

// Called by xml2js once the full document has been parsed (or has failed).
function onParsed(parseErr, doc) {
    if (parseErr) {
        return console.log(parseErr.message)
    }
    console.log(doc.merchandiser.product)
}

// Called by stream-to with the array of chunks collected from the stream.
function onCollected(readErr, chunks) {
    if (readErr) {
        return console.log(readErr.message)
    }
    // Join the chunks into a single Buffer holding the entire XML file.
    var wholeFile = Buffer.concat(chunks)
    new xml2js.Parser().parseString(wholeFile, onParsed)
}

streamTo.array(file, onCollected)

这很好用,但由于它需要把整个文件保存在内存中,如果输入文件非常大,它就行不通了。要处理非常大的文件,您需要使用流式XML解析器,例如sax。不过sax不会创建JavaScript对象;它是一个EventEmitter,使用起来有点困难,因为您必须处理所有相关事件才能动态构建对象。

您可以使用SaXPath library,它支持XPath语法的一小部分。每次匹配XPath模式时,此库都会发出match事件。这是一个例子:

var saxpath = require('saxpath')
var fs = require('fs')
var sax = require('sax')

// sax stream (created with strict = true) that SaXPath observes in order to
// find nodes matching the XPath expression.
var xmlStream = sax.createStream(true)
var productMatcher = new saxpath.SaXPath(xmlStream, '/merchandiser/product')

// Receives the XML of each matched /merchandiser/product node.
function printMatch(xml) {
    console.log(xml);
}

productMatcher.on('match', printMatch);

// Feed the file through the sax stream so that matches start firing.
var input = fs.createReadStream('input.xml')
input.pipe(xmlStream)

您有两个选择:

  1. 由于您现在每次拿到的是只包含一个产品的XML,因此可以使用xml2js一次解析一个产品。
  2. SaXPath支持多种记录器(recorder):默认的记录器监听sax事件并重新生成相应的XML(这样就可以使用第一种选择),但您也可以编写自己的记录器,监听sax事件并即时创建JavaScript对象。

答案 1 :(得分:0)

xml2js适用于已经完整加载的XML。

对于这种情况,请使用sax,它是一个流式解析器:

//安装

npm install sax

//此代码用于打印所有product_id

var fs = require('fs');
var sax = require('sax');

// Loose (non-strict) sax stream: in this mode sax upper-cases tag and
// attribute names, hence 'PRODUCT' and 'PRODUCT_ID' below.
var saxStream = sax.createStream();

// Print the product_id of every <product> element that carries one.
saxStream.onopentag = function (node) {
    // The feed also nests an attribute-less <product> (a URL) inside each
    // top-level <product>; skip those so `undefined` is never printed.
    var productId = node.attributes.PRODUCT_ID;
    if(node.name === 'PRODUCT' && productId !== undefined){
        console.log(productId);
    }
};

fs.createReadStream('xml/bigXML.xml').pipe(saxStream);

输出:

52863929
26537849
25535647