Reading a large file with stream and inserting to DB NodeJS

时间:2016-09-06 09:26:56

标签: sql-server node.js

I am reading a large file (over 1 million records with many attributes) using NodeJS and a module called line-by-line.

The file is read as a stream to avoid an out-of-memory error. Whenever I have collected 50000 records, I want to insert them into the DB. I am using the node-mssql module for that.

As I am quite new to asynchronous programming, I am having trouble understanding what causes some batches not to be imported into the DB. Usually 1-4 batches are missing.

Reading file code:

/**
 * Reads a file line by line and hands the parsed rows to
 * `dbHandler.insertToDb` in batches of 50000, plus one final batch when the
 * reader emits 'end'.
 *
 * @param fileName - Passed to the LineByLineReader constructor (presumably the
 *     path of the file to read).
 * @param {function} callback - Called with an error message string when the
 *     reader emits 'error'; also forwarded to `dbHandler.insertToDb`.
 */
function readAndInsertToDb(fileName, callback){
    // NOTE(review): `lr` is assigned without var/let/const, so it is not local
    // to this function.
    lr = new LineByLineReader(fileName);

    lr.on('error', function (err) {
        callback("Error while reading file:" + err);
    });
    var contacts = [];
    var batchNr = 0;
    var lineNr = 0;
    lr.on('line', function (line) {
        // 'line' contains the current line without the trailing newline character.
        // The first line (lineNr 0) is skipped -- presumably a header row.
        if (lineNr>0){
            //util.log("Line number: "+lineNr);
            contacts.push(toObject(line.split(";")));
            if ((contacts.length%50000)==0){
                batchNr++; 
                util.log("Inserting file data to DB. Batch #"+batchNr);
                // NOTE(review): the reader is not paused here, so 'line' events keep
                // arriving while this insert is still running; the log below shows
                // batch #3 starting before batch #2 reports completion.
                // NOTE(review): the third argument is `0` and `callback` is the fourth,
                // whereas the insertToDb shown below declares
                // (tableName, contactData, callback, lastBatch, batchNr) -- confirm
                // which signature this call is meant to target.
                dbHandler.insertToDb("DSContact", contacts, 0, callback);
                // Start a fresh array; the batch handed over above keeps its own reference.
                contacts = []; 
                // This message is logged right after the call returns, not after the
                // insert has completed.
                util.log("Array emptied after insertion to DB.");
            }
        }
        lineNr++;
    });

    lr.on('end', function () {
        batchNr++; 
        // All lines are read, file is closed now.
        util.log("Inserting file data to DB. Last batch #"+batchNr);
        var lastBatch = true;
        // Whatever is left in `contacts` is sent as the final batch.
        // NOTE(review): same argument-order question as in the 'line' handler.
        dbHandler.insertToDb("DSContact", contacts, 0, callback, lastBatch);
        return;
    });
}

DB insertion:

/**
 * Connects via `sql.connect`, bulk-inserts `contactData` into `tableName`
 * and closes the connection again with `sql.close`.
 *
 * @param tableName - Name given to the `sql.Table` used for the bulk insert.
 * @param {Array} contactData - Rows to insert; each element's `EmailAddress`
 *     and `DataSourceId` properties are read.
 * @param {function} callback - Called with an error message string when the
 *     bulk request fails, or with `null` after a successful insert when
 *     `lastBatch` is truthy.
 * @param lastBatch - Truthy when this is the final batch.
 * @param batchNr - Only used in log messages.
 */
dbHandler.prototype.insertToDb = function (tableName, contactData, callback, lastBatch, batchNr){
    // NOTE(review): `self` is never used below.
    var self = this;
    util.log("Going to insert data to DB: "+contactData.length);
    // NOTE(review): every call connects and later closes through the same `sql`
    // object. If that is a single module-wide connection (to be confirmed against
    // the mssql version in use), overlapping calls can close or replace a
    // connection that another batch is still using.
    sql.connect(this.mssqlConfig).then(function() {
        // DO the actual bulk insert
        var table = new sql.Table(tableName); 
        table.create = false;
        table.columns.add('EmailAddress', sql.NVarChar(255), {nullable: false, primary: true});
        table.columns.add('DataSourceId', sql.NVarChar(50), {nullable: true});
        util.log("Array size: "+contactData.length);
        // NOTE(review): `counter` is incremented but never read.
        var counter = 0;
        for (var i=0; i<contactData.length; i++){
            table.rows.add(contactData[i].EmailAddress, contactData[i].DataSourceId);
            counter++;
        }

        var request = new sql.Request();
        util.log("Making SQL request for batch: #"+batchNr);
        request.bulk(table, function(err, rowCount) {
            if (err){
                sql.close();
                return callback("Error during executing a query: " + err);
            }
            util.log("Insertion of a batch done. #"+batchNr);
            sql.close();
            // Only the final batch reports success; earlier batches finish silently.
            if (lastBatch){
                callback(null);
            }
            return;
        });
    }).catch(function(err) {
            // NOTE(review): the failure is only logged; `callback` is not invoked
            // on this path, so the caller is never told about it.
            util.log("Error during DB connection: "+err);
            sql.close();
    });
}

Output log (note that some batches, e.g. #5 and #18, log "Going to insert data to DB" but never reach "Array size", "Making SQL request" or "Insertion of a batch done"):

6 Sep 11:25:22 - Going to blank out table: DSContact
6 Sep 11:25:23 - Deleting from table: DSContact
6 Sep 11:25:36 - Inserting file data to DB. Batch #1
6 Sep 11:25:36 - Going to insert data to DB: 50000
6 Sep 11:25:36 - Array emptied after insertion to DB.
6 Sep 11:25:36 - Array size: 50000
6 Sep 11:25:36 - Making SQL request for batch: #1
6 Sep 11:25:38 - Insertion of a batch done. #1
6 Sep 11:25:38 - Inserting file data to DB. Batch #2
6 Sep 11:25:38 - Going to insert data to DB: 50000
6 Sep 11:25:38 - Array emptied after insertion to DB.
6 Sep 11:25:39 - Array size: 50000
6 Sep 11:25:39 - Making SQL request for batch: #2
6 Sep 11:25:40 - Inserting file data to DB. Batch #3
6 Sep 11:25:40 - Going to insert data to DB: 50000
6 Sep 11:25:40 - Array emptied after insertion to DB.
6 Sep 11:25:40 - Array size: 50000
6 Sep 11:25:40 - Making SQL request for batch: #3
6 Sep 11:25:41 - Insertion of a batch done. #2
6 Sep 11:25:42 - Inserting file data to DB. Batch #4
6 Sep 11:25:42 - Going to insert data to DB: 50000
6 Sep 11:25:42 - Array emptied after insertion to DB.
6 Sep 11:25:42 - Array size: 50000
6 Sep 11:25:42 - Making SQL request for batch: #4
6 Sep 11:25:42 - Insertion of a batch done. #3
6 Sep 11:25:44 - Inserting file data to DB. Batch #5
6 Sep 11:25:44 - Going to insert data to DB: 50000
6 Sep 11:25:44 - Array emptied after insertion to DB.
6 Sep 11:25:44 - Insertion of a batch done. #4
6 Sep 11:25:45 - Inserting file data to DB. Batch #6
6 Sep 11:25:45 - Going to insert data to DB: 50000
6 Sep 11:25:45 - Array emptied after insertion to DB.
6 Sep 11:25:45 - Array size: 50000
6 Sep 11:25:45 - Making SQL request for batch: #6
6 Sep 11:25:46 - Insertion of a batch done. #6
6 Sep 11:25:47 - Inserting file data to DB. Batch #7
6 Sep 11:25:47 - Going to insert data to DB: 50000
6 Sep 11:25:47 - Array emptied after insertion to DB.
6 Sep 11:25:47 - Array size: 50000
6 Sep 11:25:47 - Making SQL request for batch: #7
6 Sep 11:25:49 - Insertion of a batch done. #7
6 Sep 11:25:49 - Inserting file data to DB. Batch #8
6 Sep 11:25:49 - Going to insert data to DB: 50000
6 Sep 11:25:49 - Array emptied after insertion to DB.
6 Sep 11:25:49 - Array size: 50000
6 Sep 11:25:49 - Making SQL request for batch: #8
6 Sep 11:25:51 - Insertion of a batch done. #8
6 Sep 11:25:51 - Inserting file data to DB. Batch #9
6 Sep 11:25:51 - Going to insert data to DB: 50000
6 Sep 11:25:51 - Array emptied after insertion to DB.
6 Sep 11:25:51 - Array size: 50000
6 Sep 11:25:51 - Making SQL request for batch: #9
6 Sep 11:25:52 - Insertion of a batch done. #9
6 Sep 11:25:53 - Inserting file data to DB. Batch #10
6 Sep 11:25:53 - Going to insert data to DB: 50000
6 Sep 11:25:53 - Array emptied after insertion to DB.
6 Sep 11:25:53 - Array size: 50000
6 Sep 11:25:53 - Making SQL request for batch: #10
6 Sep 11:25:54 - Insertion of a batch done. #10
6 Sep 11:25:54 - Inserting file data to DB. Batch #11
6 Sep 11:25:54 - Going to insert data to DB: 50000
6 Sep 11:25:54 - Array emptied after insertion to DB.
6 Sep 11:25:54 - Array size: 50000
6 Sep 11:25:55 - Making SQL request for batch: #11
6 Sep 11:25:56 - Insertion of a batch done. #11
6 Sep 11:25:56 - Inserting file data to DB. Batch #12
6 Sep 11:25:56 - Going to insert data to DB: 50000
6 Sep 11:25:56 - Array emptied after insertion to DB.
6 Sep 11:25:56 - Array size: 50000
6 Sep 11:25:56 - Making SQL request for batch: #12
6 Sep 11:25:58 - Inserting file data to DB. Batch #13
6 Sep 11:25:58 - Going to insert data to DB: 50000
6 Sep 11:25:58 - Array emptied after insertion to DB.
6 Sep 11:25:58 - Array size: 50000
6 Sep 11:25:58 - Making SQL request for batch: #13
6 Sep 11:25:58 - Insertion of a batch done. #12
6 Sep 11:26:00 - Insertion of a batch done. #13
6 Sep 11:26:00 - Inserting file data to DB. Batch #14
6 Sep 11:26:00 - Going to insert data to DB: 50000
6 Sep 11:26:00 - Array emptied after insertion to DB.
6 Sep 11:26:00 - Array size: 50000
6 Sep 11:26:00 - Making SQL request for batch: #14
6 Sep 11:26:01 - Insertion of a batch done. #14
6 Sep 11:26:02 - Inserting file data to DB. Batch #15
6 Sep 11:26:02 - Going to insert data to DB: 50000
6 Sep 11:26:02 - Array emptied after insertion to DB.
6 Sep 11:26:02 - Array size: 50000
6 Sep 11:26:02 - Making SQL request for batch: #15
6 Sep 11:26:04 - Inserting file data to DB. Batch #16
6 Sep 11:26:04 - Going to insert data to DB: 50000
6 Sep 11:26:04 - Array emptied after insertion to DB.
6 Sep 11:26:04 - Array size: 50000
6 Sep 11:26:04 - Making SQL request for batch: #16
6 Sep 11:26:04 - Insertion of a batch done. #15
6 Sep 11:26:05 - Insertion of a batch done. #16
6 Sep 11:26:05 - Inserting file data to DB. Batch #17
6 Sep 11:26:05 - Going to insert data to DB: 50000
6 Sep 11:26:05 - Array emptied after insertion to DB.
6 Sep 11:26:06 - Array size: 50000
6 Sep 11:26:06 - Making SQL request for batch: #17
6 Sep 11:26:07 - Inserting file data to DB. Batch #18
6 Sep 11:26:07 - Going to insert data to DB: 50000
6 Sep 11:26:07 - Array emptied after insertion to DB.
6 Sep 11:26:07 - Insertion of a batch done. #17
6 Sep 11:26:08 - Inserting file data to DB. Batch #19
6 Sep 11:26:08 - Going to insert data to DB: 50000
6 Sep 11:26:08 - Array emptied after insertion to DB.
6 Sep 11:26:08 - Array size: 50000
6 Sep 11:26:09 - Making SQL request for batch: #19
6 Sep 11:26:10 - Insertion of a batch done. #19
6 Sep 11:26:10 - Inserting file data to DB. Batch #20
6 Sep 11:26:10 - Going to insert data to DB: 50000
6 Sep 11:26:10 - Array emptied after insertion to DB.
6 Sep 11:26:10 - Array size: 50000
6 Sep 11:26:10 - Making SQL request for batch: #20
6 Sep 11:26:11 - Insertion of a batch done. #20
6 Sep 11:26:11 - Inserting file data to DB. Batch #21
6 Sep 11:26:11 - Going to insert data to DB: 50000
6 Sep 11:26:11 - Array emptied after insertion to DB.
6 Sep 11:26:12 - Array size: 50000
6 Sep 11:26:12 - Making SQL request for batch: #21
6 Sep 11:26:13 - Inserting file data to DB. Batch #22
6 Sep 11:26:13 - Going to insert data to DB: 50000
6 Sep 11:26:13 - Array emptied after insertion to DB.
6 Sep 11:26:13 - Array size: 50000
6 Sep 11:26:13 - Making SQL request for batch: #22
6 Sep 11:26:14 - Insertion of a batch done. #21
6 Sep 11:26:14 - Insertion of a batch done. #22
6 Sep 11:26:15 - Inserting file data to DB. Batch #23
6 Sep 11:26:15 - Going to insert data to DB: 50000
6 Sep 11:26:15 - Array emptied after insertion to DB.
6 Sep 11:26:15 - Array size: 50000
6 Sep 11:26:15 - Making SQL request for batch: #23
6 Sep 11:26:16 - Inserting file data to DB. Last batch #24
6 Sep 11:26:16 - Going to insert data to DB: 37919
6 Sep 11:26:16 - Insertion of a batch done. #23

1 个答案:

答案 0 :(得分:0)

我自己大致想明白了:解决办法是让处理过程更加顺序化。不过我仍然想知道之前的方案为什么行不通。

我在 line-by-line 的文档中找到了这段内容:

  

行的异步处理:

// Sample quoted from the line-by-line documentation: asynchronous processing
// of lines. The 'line' handler pauses the reader, does its asynchronous work
// (represented here by a 100 ms setTimeout) and resumes the reader afterwards.
var LineByLineReader = require('line-by-line'),
    lr = new LineByLineReader('big_file.txt');

lr.on('error', function (err) {
   // 'err' contains error object
});

lr.on('line', function (line) {
   // pause emitting of lines...
   lr.pause();

   // ...do your asynchronous line processing..
   setTimeout(function () {

       // ...and continue emitting lines.
      lr.resume();
   }, 100);
});

lr.on('end', function () {
   // All lines are read, file is closed now.
});

所以我的新解决方案看起来像这样:

读取文件的代码:

/**
 * Streams a semicolon-separated file line by line and bulk-inserts the parsed
 * rows into the "DSContact" table in batches. The reader is paused while a
 * batch is being written and resumed from the insert's completion callback,
 * so this function never starts a second insert before the previous one has
 * reported back.
 *
 * @param {string} fileName - Path handed to the LineByLineReader constructor.
 * @param {function} callback - Called with an error message string on a read
 *     or insert failure, or with `null` once the final batch has been written
 *     (or immediately on 'end' when no rows are left to write).
 */
function readAndInsertToDb(fileName, callback){
    var BATCH_SIZE = 50000;
    // Declared locally: assigning to an undeclared `lr` creates an implicit
    // global (a ReferenceError in strict mode) that would be shared between calls.
    var lr = new LineByLineReader(fileName);
    var contacts = [];
    var batchNr = 0;

    lr.on('error', function (err) {
        callback("Error while reading file:" + err);
    });

    lr.on('line', function (line) {
        // 'line' contains the current line without the trailing newline character.
        contacts.push(toObject(line.split(";")));
        if (contacts.length < BATCH_SIZE){
            return;
        }

        // Stop emitting lines until this batch has been written; the reader is
        // resumed from the insert's completion callback below.
        lr.pause();
        batchNr++;
        util.log("Inserting file data to DB. Batch #"+batchNr);
        dbHandler.insertToDb("DSContact", contacts, callback, false, function(){
            lr.resume();
        });
        // Start a fresh array; the batch handed over above keeps its own reference.
        contacts = [];
        util.log("Array emptied after insertion to DB.");
    });

    lr.on('end', function () {
        // All lines are read, file is closed now.
        if (contacts.length === 0){
            // Nothing left to write (empty file or an exact multiple of the
            // batch size): skip the pointless DB round trip.
            util.log("No remaining rows after batch #"+batchNr+". Import finished.");
            return callback(null);
        }
        batchNr++;
        util.log("Inserting file data to DB. Last batch #"+batchNr);
        // No continuation is passed: the last batch reports through `callback`.
        dbHandler.insertToDb("DSContact", contacts, callback, true);
    });
}

数据库插入代码:

/**
 * Connects via `sql.connect`, bulk-inserts one batch of contacts into
 * `tableName`, closes the connection and reports the outcome.
 *
 * Exactly one continuation is invoked per call:
 *  - `callback(errorMessage)` when connecting, building the table or the bulk
 *    request fails;
 *  - `callback(null)` after a successful insert when `lastBatch` is truthy;
 *  - `innerCallback()` after a successful insert of any other batch.
 *
 * @param {string} tableName - Name given to the `sql.Table` used for the insert.
 * @param {Array} contactData - Rows to insert; each element's `EmailAddress`
 *     and `DataSourceId` properties are read.
 * @param {function} callback - Final/error continuation (see above).
 * @param {boolean} lastBatch - Truthy when this is the final batch.
 * @param {function} [innerCallback] - Invoked after a successful non-final
 *     batch, e.g. to resume the file reader. Ignored when not a function.
 */
dbHandler.prototype.insertToDb = function (tableName, contactData, callback, lastBatch, innerCallback){
    util.log("Going to insert data to DB: "+contactData.length);
    sql.connect(this.mssqlConfig).then(function() {
        // DO the actual bulk insert
        var table = new sql.Table(tableName);
        table.create = false;
        table.columns.add('EmailAddress', sql.NVarChar(255), {nullable: false, primary: true});
        table.columns.add('DataSourceId', sql.NVarChar(50), {nullable: true});
        util.log("Array size: "+contactData.length);
        for (var i=0; i<contactData.length; i++){
            table.rows.add(contactData[i].EmailAddress, contactData[i].DataSourceId);
        }

        var request = new sql.Request();
        util.log("Making SQL request.");
        request.bulk(table, function(err, rowCount) {
            // The connection is released on both the error and the success path.
            sql.close();
            if (err){
                return callback("Error during executing a query: " + err);
            }
            util.log("Insertion of a batch done.");
            if (lastBatch){
                return callback(null);
            }
            // Guarded so that a caller passing no (or a non-function)
            // continuation does not trigger a TypeError here.
            if (typeof innerCallback === 'function'){
                return innerCallback();
            }
        });
    }).catch(function(err) {
        util.log("Error during DB connection: "+err);
        sql.close();
        // Report the failure instead of only logging it; otherwise the caller
        // (which has paused its reader) would wait forever.
        callback("Error during DB connection: " + err);
    });
}