"No fiber running" when trying to parse a CSV line by line

Date: 2016-01-28 15:22:38

Tags: javascript node.js csv node-fibers

I'm trying to understand how to use fibers with fast-csv to build a line-by-line reader (a single-user command-line script) that pauses reading/processing after each line until that line has finished its various asynchronous calls. (Rather than rolling my own CSV code, I want to use something that has already worked out the gotchas of the CSV format.)

If I do this:

var csv = require("fast-csv");

var CSV_STRING = 'a,b\n' +
'a1,b1\n' +
'a2,b2\n';

csv
.fromString(CSV_STRING, {headers: false})
.on("record", function (data) {
    console.log("line="+JSON.stringify(data));
    setTimeout(function(){
        console.log("timeout");
    },2000);
})
.on("end", function () {
    console.log("done parsing CSV records");
});
console.log("done initializing csv parse");

I get what I expect:

done initializing csv parse
line=["a","b"]
line=["a1","b1"]
line=["a2","b2"]
done parsing CSV records
timeout
timeout
timeout

If I try to yield with a fiber after each record:

var Fiber = require('fibers');

Fiber(
    function () {
        var fiber = Fiber.current;

        csv
            .fromString(CSV_STRING, {headers: false})
            .on("record", function (data) {
                console.log("line="+JSON.stringify(data));
                setTimeout(function(){
                    console.log("timeout");
                    fiber.run();
                },2000);
                Fiber.yield();
            })
            .on("end", function () {
                console.log("done parsing CSV records");
            });
        console.log("done initializing csv parse");
    }).run();

I get:

done initializing csv parse
line=["a","b"]
events.js:141
      throw er; // Unhandled 'error' event
      ^

Error: yield() called with no fiber running

I think I understand what's happening: the code inside Fiber(...).run() runs to completion, so execution leaves the fiber before yield is ever called; by the time a record handler reaches Fiber.yield(), there is no fiber running anymore. (Hence the apt error message "no fiber running".)

What is the proper way to keep the fiber alive until I'm done parsing?

It seems like such a simple problem, but I'm not seeing the obvious answer. At first I wanted to put a yield in right before execution leaves Fiber(...).run(), but that doesn't work, because the first fiber.run() from a timeout callback would just make it leave the fiber again.
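For reference, here is a minimal sketch of the rule that error enforces, independent of fast-csv (assuming only that node-fibers is installed): Fiber.yield() may only be called while a fiber is running, and a setTimeout callback runs outside the fiber, so all it can legally do is resume the paused fiber with fiber.run():

var Fiber = require('fibers');

Fiber(function () {
    var fiber = Fiber.current;
    setTimeout(function () {
        // This callback runs outside the fiber; calling Fiber.yield() here
        // would throw "yield() called with no fiber running". The only valid
        // move is to resume the paused fiber:
        fiber.run("resumed");
    }, 100);
    // Pause while still inside the fiber; the value passed to run() becomes
    // the return value of yield().
    var value = Fiber.yield();
    console.log(value); // "resumed"
}).run();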

The flow I want is:

done initializing csv parse
line=["a","b"]
timeout
line=["a1","b1"]
timeout
line=["a2","b2"]
timeout
done parsing CSV records

But that may not be possible without rewriting fast-csv's internals, since fast-csv controls when each record event fires. My current understanding is that each event is emitted from inside fast-csv, and when the user's handler in csv.on("record") returns, control goes straight back to the loop inside fast-csv that is parsing the CSV.
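A toy sketch of that control flow using a plain EventEmitter (a stand-in for fast-csv's internals, which I haven't verified): emit() is synchronous, so anything asynchronous the handler schedules only runs after the emitting loop has already finished every row.

var EventEmitter = require('events').EventEmitter;

var emitter = new EventEmitter();

emitter.on('record', function (row) {
    console.log('handler saw ' + row);
    setTimeout(function () {
        console.log('async work for ' + row); // runs only after ALL rows
    }, 0);
});

// Stand-in for the parser's internal loop: emit() returns only when the
// handler returns, so the loop never waits for the async work above.
['a,b', 'a1,b1', 'a2,b2'].forEach(function (row) {
    emitter.emit('record', row);
});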

2 Answers:

Answer 0 (score: 0)

Node: v5.4.0

Well, here's one way to get that behavior. I use an ES6 generator to read the raw file line by line, and then drive the fast-csv library from the generator to parse each raw line string in turn. The result is a non-async execution flow, with output like an old single-user command-line script.

'use strict';
var csv = require("fast-csv");
var sfs = require('./sfs');

// Parse one raw CSV line; the "record" callback resumes the main generator
// (the global `it` defined below) with the parsed row.
function parse(line) {
    csv
        .fromString(line, {headers: false})
        .on("record", function (data) {
            it.next(data);
        });
}

function *main() {
    // Make sure to initialize with a max buffer big enough to span any possible line length; otherwise behavior is undefined.
    var fs = new sfs(it, 4096);
    var result=yield fs.open("data.csv");

    var line;

    while((line=yield fs.readLine()) != null) {
        console.log("line="+line);

        var csvData=yield parse(line); // paused here until the "record" callback calls it.next(data)
        console.log("value1="+csvData[0]+" value2="+csvData[1]);
    }

    console.log("DONE");
}

var it = main();
it.next(); // get it all started

That, plus a quirky (quick and hacky) class to wrap the fs bits I needed. I'm sure there are better ways to do what I did, but it fits my needs.

sfs.js

'use strict';
var fs=require('fs')

class sfs {
    constructor(it, maxbufsize) {
        this.MAX_BUF=maxbufsize;
        this.it=it;
        this.fd=null;
        this.lineBuf="";
        this.buffer=new Buffer(this.MAX_BUF);
        this.buflen=0;
    }

    open(file) {
        var parent=this;
        fs.open(file,'r',function(err,fd){
            parent.fd=fd;
            var parent2=parent;
            fs.fstat(fd,function(err, stats){
                parent2.stats=stats;
                parent2.it.next(stats);
            })
        })

    }

    readLine(){
        var parent = this;
        var i=0
        var s=this.stats.size
        var line="";
        var index=this.MAX_BUF-this.buflen;

        // read data into buffer, buffer may already have data from previous read that was shifted left over extracted line
        fs.read(this.fd,this.buffer,this.MAX_BUF-index,index,null,function(err,len,buf){
            var expectedReadLen=parent.MAX_BUF-parent.buflen;
            if(len < expectedReadLen) {  // If we didn't read enough to backfill buffer, lets make sure the string is terminated
                // as it shifts left so we don't try interpret older records to the right
                parent.buffer.fill(' ',parent.buflen+len,parent.MAX_BUF);
            }
            parent.buflen+=len; // whatever was in buffer has more now

            index=parent.buffer.indexOf('\n');

            if(index > -1) {
                line=parent.buffer.toString('utf8',0,index);
                buf.copy(parent.buffer,0,index+1,parent.buflen); // shift unused data left
                parent.buflen-=(index+1); // buffer left over after removing /n terminated line
                if(len<expectedReadLen) {  // If we didn't read enough to backfill buffer, lets make sure we erase old data
                    parent.buffer.fill(' ',parent.buflen,parent.MAX_BUF);
                }
            } else {
                if(parent.buflen > 0) {
                    line=parent.buffer.toString('utf8',0,parent.buflen);
                    parent.buflen=0;
                } else {
                    line=null;
                }
            }
            parent.it.next(line);
        });
    }

    close() {
        fs.close(this.fd);
    }
}

module.exports=sfs;

Answer 1 (score: 0)

Streams are pausable/resumable:

var csv = require("fast-csv");

var CSV_STRING = 'a,b\n' +
    'a1,b1\n' +
    'a2,b2\n';

var stream = csv.fromString(CSV_STRING, { headers: false })
    .on("data", function (data) {
        // pause the stream
        stream.pause();
        console.log("line: " + JSON.stringify(data));
        setTimeout(function () {
            // all async stuff are done, resume the stream
            stream.resume();
            console.log("timeout");
        }, 2000);
    }).on("end", function () {
        console.log("done parsing CSV records");
    });

The console output is almost exactly what you want:

/*
line: ["a","b"]
timeout
line: ["a1","b1"]
timeout
line: ["a2","b2"]
done parsing CSV records
timeout
*/

May I ask why you absolutely need to read your CSV synchronously?