问题:如何通过在字段上进行状态检查来批量插入百万条记录
情况:我必须多次运行独立节点脚本,并获取需要插入Mongo DB的JSON数据。此JSON数据可以具有先前插入的相同数据,也可以在需要更新的一个字段(计数)中进行更改。
字段'count'可以大于,等于或小于现有条目。
要求:我需要更新该文档(行),只有当计数比现有文件更强。
我应该能够通过获取现有记录然后循环每条记录并进行条件检查来完成此操作。这不是我想要的,因为将有数百万条记录要插入。
问题1 。 mongoose bulkWrite
有没有办法在upsert
之前进行这样的条件检查?
问题2 。如果bulkWrite
不是我应该使用的东西
是替代解决方案,具有更好的性能和更少
应用程序内存开销?
MongoDB服务器版本:3.4.10
下面的示例代码插入2条记录,经过一段时间的延迟后,它会尝试插入另一条只有计数更改的记录,该记录小于DB中的现有条目。
在这种情况下,我的要求是不更新第二个记录计数值。
问题3:为什么DB中没有设置默认值字段?
问题4:在处理大量数据时,是否有事件要知道是否所有记录都写入磁盘?由于数据库保存是“async
”调用,何时触发database.close
和process.exit()
,当前代码只是等待5秒,这是错误的实现方式。
如果我不关闭数据库,应用程序将不会退出,如果我退出应用程序超时,可能会导致在将完整数据写入磁盘之前退出代码。
/**
* Get DB Connection
*/
const mongoose = require("mongoose");
mongoose.Promise = global.Promise;
const url = "mongodb://127.0.0.1:27017/sample";
let DATABASE = {};
DATABASE.connect = () => {
let db = mongoose.createConnection(url, { autoIndex: false, connectTimeoutMS: 30000, reconnectTries: 30, reconnectInterval: 2000 });
console.log("Database connecting to URL ", url);
db.on('error', console.error.bind(console, 'connection error:'));
return db;
}
DATABASE.close = (db) => {
if (db) {
db.close(function () {
console.log('Mongoose default connection with DB disconnected ');
});
}
}
/**
* Now define schema (will be part of module file)
*/
const Schema = mongoose.Schema;
var apiDetailSchema = new Schema({
ip: String,
instance: Number,
component: String,
logDate: Number,
count: Number,
apiMethod: {type:String, default:'DEFAULT METHOD', required: true},
api: String,
status: String,
httpMethod: String,
environment: String,
datacenter: {type:String, default:'INDIA', required:true},
});
apiDetailSchema.index({ ip: 1, instance: 1, component: 1, logDate: 1, api: 1, status: 1, httpMethod:1, environment:1, datacenter:1}, { unique: true });
const API_DETAIL = {};
API_DETAIL.connect = () => {
if (API_DETAIL.db) {
console.log("Returning existing DB for API Schema");
return API_DETAIL.db;
}
console.log("Requesting New DB connection for API Schema");
API_DETAIL.db = DATABASE.connect();
return API_DETAIL.db;
}
API_DETAIL.close = () => {
if (API_DETAIL.db) DATABASE.close(API_DETAIL.db);
}
API_DETAIL.connect();
API_DETAIL.SCHEMA = API_DETAIL.db.model('apiDetail', apiDetailSchema);
/**
* Use of API_DETAIL to insert data
*/
var bulkUpdateApiData = (data) => {
let total = data.length;
return new Promise((resolve, reject) => {
if (total === 0) {
resolve("NO DATA to update API details");
}
console.log("Bulkupdating "+total+" API records");
let db = API_DETAIL.connect(); // Connect to DB
if (!db) {
console.log("Failed to obtain DB connection during API Bulk update");
reject("ERROR: DB Connection failed");
}
let bulkOps = [];
console.log("Going to Bulk update "+total+" API details");
data.forEach(d => {
let { ip, instance, component, logDate, count, api, status, httpMethod, environment, datacenter } = d;
let upsertDoc = {
'updateOne': {
// Filter applied to all field except count, so that it will update count
// TODO: Check if the count is more, then only update
'filter': { ip, instance, component, logDate, api, status, httpMethod, environment, datacenter },
'update': d,
'upsert': true
}
}
bulkOps.push(upsertDoc);
});
API_DETAIL.SCHEMA.bulkWrite(bulkOps).then(BulkWriteOpResultObject => {
console.log(total + " API Details updated to DB");
// console.log(JSON.stringify(BulkWriteOpResultObject, null, 2));
resolve("Updated "+total+ " API Details");
}).catch(e => {
console.log("ERROR upserting addIpDetail", e);
reject(e);
});
});
} // Function : bulkUpdateApiData
let initialData = [
{
"ip": "192.168.1.2",
"instance": 2,
"component": "NODE",
"logDate": "20180114",
"api": "/services/srest/abc/authenticator/login",
"status": "200",
"httpMethod": "POST",
"environment": "production",
"count": 8
},
{
"ip": "192.168.1.2",
"instance": 2,
"component": "NODE",
"logDate": "20180114",
"api": "/services/srest/abc/authenticator/logout",
"status": "204",
"httpMethod": "POST",
"environment": "production",
"count": 8888 // Initially it was more
}];
bulkUpdateApiData(initialData).then(output => {
console.log(output);
}).catch(e => {
console.log("Something went wrong during API Detail bulk update", e);
});
let newData = [
{
"ip": "192.168.1.2",
"instance": 2,
"component": "NODE",
"logDate": "20180114",
"api": "/services/srest/abc/authenticator/logout",
"status": "204",
"httpMethod": "POST",
"environment": "production",
"count": 10 // Now it is lesser than initial one
}];
// Wait for 2 seconds to complete previous write operation,
// if not below bulkWrite will complete first !!!
setTimeout(() => {
console.log("=================================================");
console.log("Bulk updating EXISTING data with lesser count");
bulkUpdateApiData(newData).then(output => {
console.log(output);
}).catch(e => {
console.log("Something went wrong during API Detail bulk update", e);
});
}, 2000);
console.log("-----------------------------------------------");
// As DB write / save is async operation, When should I call this CLOSE connection?
// Is there a way to know when exactly DB write is completed?
setTimeout(API_DETAIL.close, 5000);
输出:
> node bulkWrite.js
Requesting New DB connection for API Schema
Database connecting to URL mongodb://127.0.0.1:27017/sample
Bulkupdating 2 API records
Returning existing DB for API Schema
Going to Bulk update 2 API details
-----------------------------------------------
2 API Details updated to DB
Updated 2 API Details
=================================================
Bulk updating EXISTING data with lesser count
Bulkupdating 1 API records
Returning existing DB for API Schema
Going to Bulk update 1 API details
1 API Details updated to DB
Updated 1 API Details
Mongoose default connection with DB disconnected
数据库输出,其中第二个记录/文档在提交的“计数”中具有更新值
> db.apidetails.find().pretty()
{
"_id" : ObjectId("5a5df2d1952021f65578fc8f"),
"api" : "/services/srest/abc/authenticator/login",
"component" : "NODE",
"count" : 8,
"datacenter" : null,
"environment" : "production",
"httpMethod" : "POST",
"instance" : 2,
"ip" : "192.168.1.2",
"logDate" : 20180114,
"status" : "200"
}
{
"_id" : ObjectId("5a5df2d1952021f65578fc90"),
"api" : "/services/srest/abc/authenticator/logout",
"component" : "NODE",
"count" : 10,
"datacenter" : null,
"environment" : "production",
"httpMethod" : "POST",
"instance" : 2,
"ip" : "192.168.1.2",
"logDate" : 20180114,
"status" : "204"
}