我在一个MongoDB集合中大约有2000万个文档。现在我想添加一个名为“score”(分数)的字段,分数值是通过机器学习模型计算得出的。我的任务是遍历MongoDB中的每个文档,计算分数并更新该文档。但问题在于更新过程需要很长时间才能完成。
这是我当前正在使用的代码。
// Backfill the `score` field on every document that does not yet have one.
// Fixes vs. the original:
//  - `doc.save()` is now awaited. Previously its Promise was discarded, so the
//    eachAsync callback resolved immediately and `{parallel: threads}` had no
//    throttling effect at all (which is why raising `threads` changed nothing).
//  - The eachAsync Promise is awaited and rejections are reported instead of
//    being silently dropped.
//  - Typo `corsor` -> `cursor`; the progress bar is stopped on completion.
const config = require('./config');
const mongoose = require('mongoose');
const audit = require('./lib/Audit');

// Max number of documents being scored/saved concurrently.
const threads = 4;

mongoose.connect(config.db.uri, { useNewUrlParser: true, useCreateIndex: true });
console.log("Connected to mongodb!");

const Schema = require('./models/Schema.js');
const CliProgress = require('cli-progress');
const bar1 = new CliProgress.Bar({
    etaBuffer: 5000,
    format: '[{bar}] {percentage}% | ETA: {eta_formatted} | {value}/{total}'
}, CliProgress.Presets.shades_grey);

(async function () {
    const documents = await Schema.find({ 'score': null }).count();
    let completed = 0;
    bar1.start(documents, completed);

    const cursor = Schema.find({ 'score': null }).cursor();
    // Awaiting save() inside an async callback is what makes `parallel`
    // actually cap in-flight writes: eachAsync waits for the returned Promise.
    await cursor.eachAsync(async (doc) => {
        doc.lastmod = Date.now();
        // Calculating The Score
        doc.score = audit(doc.toObject());
        // Saving To Database
        await doc.save();
        completed++;
        bar1.update(completed);
    }, { parallel: threads });

    bar1.stop();
    process.exit(0);
})().catch((err) => {
    // Surface failures instead of leaving an unhandled rejection.
    console.error(err);
    process.exit(1);
});
请看一下这段代码,告诉我如何加快处理速度。我已经尝试过增大线程数(threads)的值,但是这对处理速度几乎没有影响。
答案 0(得分:0)
我有一个解决方案可以大大加快这一过程!方法是像下面这样使用批量(bulk)操作。就我而言,此方法快了20倍。
const config = require('./config');
const mongoose = require('mongoose');
const Audit = require("./lib/Audit");
const ObjectId = require('mongodb').ObjectId;
let completed = 0;
let documents = 0;
let timeouts = null;
mongoose.connect(config.db.uri, {useNewUrlParser: true, useCreateIndex: true});
console.log("Connected to mongodb!");
console.log('\033[2J');
const Schema = require('./models/Schema.js');
const CliProgress = require('cli-progress');
const bar1 = new CliProgress.Bar({
etaBuffer: 5000,
format: '[{bar}] {percentage}% | ETA: {eta_formatted} | {value}/{total}'
}, CliProgress.Presets.shades_grey);
(async function (ref) {
documents = await Schema.find({}).count();
completed = 0;
bar1.start(documents, completed);
let corsor = Schema.find().lean().cursor();
let bulk = Schema.collection.initializeOrderedBulkOp();
corsor.on("data", (doc) => {
// console.log(update)
doc.lastmod = Date.now();
doc.swarm.verified = true;
if(doc.swarm.seeders || doc.swarm.leechers){
doc.swarm.audit = Audit(doc);
}
bulk.find({_id: ObjectId(doc._id)}).updateOne({ $set: { "swarm": doc.swarm, lastmod: doc.lastmod }});
completed ++;
bar1.update(completed);
return true;
});
corsor.on('end',() => {
clearTimeout(timeouts);
bulk.execute(()=> {
console.log('\033[2J');
console.log('Task Completed!');
setTimeout(()=> {
process.exit();
}, 5000)
});
});
// {parallel: 200}
function start(){
timeouts = setTimeout(() => {
doTasks()
}, 120000);
}
start();
function doTasks() {
corsor.pause();
clearTimeout(timeouts);
console.log('\033[2J');
console.log('Updating Documents.....');
console.log('Waiting For Complete.....');
bulk.execute(()=> {
// Done Writing Documents
start()
console.log('\033[2J');
console.log('Resuming task........');
// Start another Bulk Operation
bulk = Schema.collection.initializeOrderedBulkOp();
// Resume The cursor
corsor.resume();
});
}
})();