We are reading an XML file with roughly 500k elements (using xml-stream) and inserting them into MongoDB like this:
xml.on(`endElement: product`, writeDataToDb.bind(this, "product"));
The insert inside writeDataToDb(type, obj) looks like this:
collection.insertOne(obj, {w: 1, wtimeout: 15000}).catch((e) => { });
Now, when the Mongo connection drops, the xml stream keeps on reading and the console fills up with error messages (can't insert, disconnected, EPIPE broken, ...).
The docs say:
When you shut down the mongod process, the driver stops processing operations and keeps buffering them, because bufferMaxEntries is -1 by default, meaning buffer all operations.
What does this buffer actually do?
We noticed that when we insert data and shut down the mongo server, things do get buffered; we then bring the mongo server back up, the native driver reconnects successfully and node resumes inserting data, but the documents that were buffered (while mongo was offline) are never inserted again.
So I question this buffer and what it is actually good for.
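For context, the buffer the docs refer to is the driver's internal operation queue, and it is configured through the connection options. Below is a minimal sketch of how those options fit together (an illustration only, assuming the 2.x native driver and a local test database; the option names are the same ones used in the first answer below):

var MongoClient = require('mongodb').MongoClient

MongoClient.connect('mongodb://localhost:27017/test', {
  bufferMaxEntries: 0,      // default is -1: queue every operation while disconnected; 0 makes them fail immediately
  reconnectTries: 30,       // how many times the driver tries to reconnect
  reconnectInterval: 1000   // milliseconds between reconnect attempts
}, function (err, db) {
  if (err != null) return console.log('connect error: ' + err)
  // with bufferMaxEntries: 0 this insert rejects right away while the server
  // is down, so the application can keep the document and retry later instead
  // of silently losing it
  db.collection('product')
    .insertOne({ name: 'example' }, { w: 1, wtimeout: 15000 })
    .catch(function (e) { console.log('insert failed: ' + e.message) })
})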
Goal:
We are looking for the best way to keep the inserts in a buffer until mongo comes back (within 15000 ms, according to wtimeout) and then insert the buffered documents, or to use xml.pause(); and xml.resume(), which we tried without success.
Basically we need a little help on how to handle disconnects without data loss or interruption.
Answer 0 (score: 2)
Inserting 500K elements with individual insertOne() calls is a very bad idea. You should use bulk operations instead, which let you insert many documents in a single request (here, for example, 10,000 at a time, so the whole file can be done in about 50 requests).
To avoid the buffering problem, handle it manually:
- bufferMaxEntries: 0 (disables the driver's internal buffering, so failed operations surface immediately)
- reconnectTries: 30, reconnectInterval: 1000 (controls how often and for how long the driver tries to reconnect)
Here is a sample script:
var fs = require('fs')
var Xml = require('xml-stream')
var MongoClient = require('mongodb').MongoClient
var url = 'mongodb://localhost:27017/test'

MongoClient.connect(url, {
  reconnectTries: 30,      // keep trying to reconnect (about 30 seconds in total)
  reconnectInterval: 1000, // wait 1000 ms between reconnect attempts
  bufferMaxEntries: 0      // fail operations immediately instead of buffering them while disconnected
}, function (err, db) {
  if (err != null) {
    console.log('connect error: ' + err)
  } else {
    var collection = db.collection('product')
    var bulk = collection.initializeUnorderedBulkOp()
    var totalSize = 500001
    var size = 0

    var fileStream = fs.createReadStream('data.xml')
    var xml = new Xml(fileStream)
    xml.on('endElement: product', function (product) {
      bulk.insert(product)
      size++
      // if we have enough products, save them using a bulk insert
      if (size % 10000 == 0) {
        xml.pause()
        bulk.execute(function (err, result) {
          if (err == null) {
            bulk = collection.initializeUnorderedBulkOp()
            console.log('doc ' + (size - 10000) + ' : ' + size + ' saved on first try')
            xml.resume()
          } else {
            console.log('bulk insert failed: ' + err)
            var counter = 0
            var retryInsert = setInterval(function () {
              counter++
              bulk.execute(function (err, result) {
                if (err == null) {
                  clearInterval(retryInsert)
                  bulk = collection.initializeUnorderedBulkOp()
                  console.log('doc ' + (size - 10000) + ' : ' + size + ' saved after ' + counter + ' tries')
                  xml.resume()
                } else if (err.code === 11000) { // ignore duplicate ID errors: part of the batch may already have been written before the failure
                  clearInterval(retryInsert)
                  bulk = collection.initializeUnorderedBulkOp()
                  console.log('doc ' + (size - 10000) + ' : ' + size + ' saved after ' + counter + ' tries')
                  xml.resume()
                } else {
                  console.log('failed after first try: ' + counter, 'error: ' + err)
                }
              })
            }, 3000) // retry every 3000 ms until success
          }
        })
      } else if (size === totalSize) {
        // flush the final partial batch and close the connection
        bulk.execute(function (err, result) {
          if (err == null) {
            db.close()
          } else {
            console.log('bulk insert failed: ' + err)
          }
        })
      }
    })
  }
})
Sample log output:
doc 0 : 10000 saved on first try
doc 10000 : 20000 saved on first try
doc 20000 : 30000 saved on first try
[...]
bulk insert failed: MongoError: interrupted at shutdown // mongodb server shutdown
failed after first try: 1 error: MongoError: no connection available for operation and number of stored operation > 0
failed after first try: 2 error: MongoError: no connection available for operation and number of stored operation > 0
failed after first try: 3 error: MongoError: no connection available for operation and number of stored operation > 0
doc 130000 : 140000 saved after 4 tries
doc 140000 : 150000 saved on first try
[...]
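As a side note, on more recent versions of the native driver the same batching idea can also be expressed with insertMany() instead of the bulk API. A minimal sketch (the helper name is illustrative; { ordered: false } plays the same role as the unordered bulk op above, letting the server continue past duplicate-key errors from a partially written, retried batch):

// hypothetical helper: write one batch of products in a single request
function insertBatch(collection, products) {
  return collection.insertMany(products, { ordered: false })
    .catch(function (err) {
      if (err.code === 11000) return // duplicates from a retried batch, same as the 11000 check above
      throw err                      // anything else should be retried by the caller
    })
}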
Answer 1 (score: 1)
I don't know the MongoDB driver or this entries buffer; maybe it only keeps data in specific scenarios.
So I will answer this question with a more general approach that works with any database.
To summarize, you have two problems:
1. a failed insert is simply lost, so documents disappear whenever the connection drops
2. the xml stream keeps producing data regardless of whether the database can keep up
To handle the first problem, you need to implement a retry algorithm that ensures several attempts are made before giving up.
To handle the second problem, you need to implement back pressure on the xml stream. You can do that with the pause method, the resume method, and an input buffer.
var Promise = require('bluebird');
var fs = require('fs');
var Xml = require('xml-stream');

var fileStream = fs.createReadStream('myFile.xml');
var xml = new Xml(fileStream);

// simple exponential retry algorithm based on promises
function exponentialRetry(task, initialDelay, maxDelay, maxRetry) {
    var delay = initialDelay;
    var retry = 0;
    var closure = function() {
        return task().catch(function(error) {
            retry++;
            if (retry > maxRetry) {
                throw error
            }
            var promise = Promise.delay(delay).then(closure);
            delay = Math.min(delay * 2, maxDelay);
            return promise;
        })
    };
    return closure();
}

var maxPressure = 100;
var currentPressure = 0;
var suspended = false;
var stopped = false;
var buffer = [];

// handle back pressure by storing incoming tasks in the buffer
// pause the xml stream as soon as we have enough tasks to work on
// resume it when the buffer is empty
function writeXmlDataWithBackPressure(product) {
    // closure used to try to start a task
    var tryStartTask = function() {
        // if we have enough tasks running, pause the xml stream
        if (!stopped && !suspended && currentPressure >= maxPressure) {
            xml.pause();
            suspended = true;
            console.log("stream paused");
        }
        // if we have room to run tasks
        if (currentPressure < maxPressure) {
            // if we have a buffered task, start it
            // if not, resume the xml stream
            if (buffer.length > 0) {
                buffer.shift()();
            } else if (!stopped) {
                try {
                    xml.resume();
                    suspended = false;
                    console.log("stream resumed");
                } catch (e) {
                    // the only way to know if you've reached the end of the stream
                    // xml.on('end') can be triggered BEFORE all handlers are called
                    // probably a bug of xml-stream
                    stopped = true;
                    console.log("stream end");
                }
            }
        }
    };

    // push the task to the buffer
    buffer.push(function() {
        currentPressure++;
        // use exponential retry to ensure we will try this operation 100 times before giving up
        exponentialRetry(function() {
            return writeDataToDb(product)
        }, 100, 2000, 100).finally(function() {
            currentPressure--;
            // a task has just finished, let's try to run a new one
            tryStartTask();
        });
    });

    // we've just buffered a task, let's try to run it
    tryStartTask();
}

// write the product to database here :)
function writeDataToDb(product) {
    // the following code is here to create random delays and random failures (just for testing)
    var timeToWrite = Math.random() * 100;
    var failure = Math.random() > 0.5;
    return Promise.delay(timeToWrite).then(function() {
        if (failure) {
            throw new Error();
        }
        return null;
    })
}
xml.on('endElement: product', writeXmlDataWithBackPressure);
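The writeDataToDb used in this example is only a stub that simulates random latency and random failures. A minimal sketch of wiring it to the actual insert from the question could look like this (assuming collection comes from an open MongoClient connection, as in the first answer):

// real writeDataToDb: insertOne() already returns a promise, so a failed
// insert simply propagates to exponentialRetry(), which schedules another attempt
function writeDataToDb(product) {
  return collection.insertOne(product, { w: 1, wtimeout: 15000 })
}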
Play with it and add some console.log calls to see how the back-pressure logic behaves.
I hope this helps you solve your problem :)