I have a function that runs periodically that updates the item.price of some Documents in my Prices Collection. The Prices Collection has 100k+ items. The function looks like this (it relies on the request module, a Price model, a logger, and an array of API urls defined elsewhere):

//Just a helper function for multiple GET requests with request.
let _request = (urls, cb) => {
let results = {}, i = urls.length, c = 0;
handler = (err, response, body) => {
let url = response.request.uri.href;
results[url] = { err, response, body };
if (++c === urls.length) {
cb(results);
}
};
while (i--) {
request(urls[i], handler);
}
};
// function to update the prices in our Prices collection.
const update = (cb) => {
Price.remove({}, (err, remove) => {
if (err) {
return logger.error(`Error removing items...`);
}
logger.info(`Removed all items... Beginning to update.`);
_request(urls, (responses) => {
let url, response, id;
for (url in responses) {
id = url.split('/')[5].split('?')[0];
response = responses[url];
if (response.err) {
logger.error(`Error in request to ${url}: ${response.err}`);
return;
}
if (response.body) {
logger.info(`Request to ${url} successful.`)
let jsonResult = {};
try {
jsonResult = JSON.parse(response.body);
} catch (e) {
logger.error(`Could not parse.`);
}
logger.info(`Response body for ${id} is ${Object.keys(jsonResult).length}.`);
let allItemsArray = Object.keys(jsonResult).map((key, index) => {
return {
itemid: id,
hash_name: key,
price: jsonResult[key]
}
});
Price.insertMany(allItemsArray).then(docs => {
logger.info(`Saved docs for ${id}`)
}, (e) => {
logger.error(`Error saving docs.`);
});
}
}
if (cb && typeof cb == 'function') {
cb();
}
})
});
}
As you can see, to avoid iterating over 100k+ Documents and updating each one of them separately, I remove them all at the beginning, just call the API that gives me these items, and then use insertMany to insert all of them into my Prices Collection. This update process happens every 30 minutes. But I just realized: what if some user wants to check the prices while my Prices Collection is currently empty, because it is in the middle of updating itself?
The Question
So do I have to iterate over all of them so that I don't have to delete them first? (Remember, there are a lot of Documents that need to be updated every 30 minutes.) Or is there another solution?
Here is a picture of how my Prices Collection looks (there are 100k Documents like these; I just want to update the price property):
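(The picture isn't reproduced here. Judging from the insertMany call above, a single document presumably looks roughly like this, with made-up placeholder values:)

{
    "_id": ObjectId("..."),          // generated by MongoDB
    "itemid": "12345",               // parsed from the API url
    "hash_name": "Some Item Name",   // a key of the parsed API response body
    "price": 1.23                    // the corresponding value -- the only property that needs updating
}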
Update:
I re-wrote the update function for my Prices Collection, and now it looks like this:
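(The rewritten function itself isn't reproduced in this copy of the question. For reference, here is a hypothetical sketch of what a bulk-based version of update typically looks like, reusing _request, urls, Price and logger from the code above and the unordered bulk API suggested in the answers below; it is not the asker's actual code.)

// Hypothetical reconstruction -- not the asker's actual code.
const update = (cb) => {
    _request(urls, (responses) => {
        let bulk = Price.collection.initializeUnorderedBulkOp();
        for (let url in responses) {
            let response = responses[url];
            if (response.err || !response.body) continue;
            let id = url.split('/')[5].split('?')[0];
            let jsonResult = {};
            try {
                jsonResult = JSON.parse(response.body);
            } catch (e) {
                logger.error(`Could not parse response for ${id}.`);
                continue;
            }
            Object.keys(jsonResult).forEach(key => {
                // upsert one document per (itemid, hash_name) pair, only touching price
                bulk.find({ itemid: id, hash_name: key })
                    .upsert()
                    .updateOne({ $set: { price: jsonResult[key] } });
            });
        }
        bulk.execute((err) => {
            if (err) {
                return logger.error(`Bulk update failed.`);
            }
            logger.info(`Bulk update finished.`);
            if (cb && typeof cb === 'function') cb();
        });
    });
};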
Notice the bulk variable now (thanks @Rahul), but now the Collection takes a very long time to update. My processor is burning up and it literally takes 3+ minutes to update 60k+ Documents. I honestly feel like the previous method, even though it deletes all of them and then re-inserts them, is also about 10x faster.
Any ideas?
Answer 0 (score: 5)
In my experience (updating millions of mongo docs on an hourly basis), here is a realistic approach to very large bulk updates:

1. do all your API calls separately and write the results as BSON into a file
2. invoke mongoimport and import that BSON file into a new, empty collection prices_new (JavaScript, let alone a high-level OO wrapper, is just too slow for that)
3. rename prices_new -> prices with dropTarget=true (this will be atomic, hence no downtime)

Schematically, it would look like this in JS:

// Assumed dependencies (not spelled out in the original answer): the bson package for
// BSON.serialize, promisified fs.writeFile / child_process.exec, and a flatMap helper
// (e.g. from lodash).
let fname = '/tmp/data.bson';
let apiUrls = [...];
async function doRequest(url) {
// perform a request and return an array of records
}
let responses = await Promise.all(apiUrls.map(doRequest));
// if the data too big to fit in memory, use streams instead of this:
let data = flatMap(responses, BSON.serialize).join('\n');
await fs.writeFile(fname, data);
await child_process.exec(`mongoimport --collection prices_new --drop ${fname}`);
await db.prices_new.renameCollection('prices', true);
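(If the final rename is driven from Node rather than the mongo shell, the official driver exposes the same operation; roughly, assuming db is a connected Db instance:)

// same atomic rename with dropTarget, via the Node.js driver (`db` is assumed to be a connected Db)
await db.renameCollection('prices_new', 'prices', { dropTarget: true });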
Answer 1 (score: 2)
Answer 2 (score: 1)
I haven't tested anything, but you can try this; it might help. I am using the bluebird library for concurrency.
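// Assumed dependencies (not shown in the original answer):
// const Bluebird = require('bluebird');
// const request = require('request');
// const Price = require('./models/price'); // the same Mongoose Price model as in the question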
let _request = (url) => {
return new Promise((resolve, reject) => {
request(url, (err, response, body) => {
if (err) {
return reject(err);
}
resolve(body);
});
});
};
const formatRespose = async (response) => {
// do stuff
return {
query: {}, // itemid: id,
body: {}
};
}
const bulkUpsert = (allItemsArray) => {
let bulk = Price.collection.initializeUnorderedBulkOp();
return new Promise((resolve, reject) => {
allItemsArray.forEach(item => {
bulk.find(item.query).upsert().updateOne(item.body);
});
bulk.execute((err, bulkers) => {
if (err) {
return reject(err);
}
return resolve(bulkers);
});
});
}
const getAndUpdateData = async (urls) => {
    // `await` is only valid inside an async function, so the map callback is async
    // and Promise.all resolves the resulting array of promises.
    const allItemsArray = await Promise.all(urls.map(async (url) => {
        const requestData = await _request(url); // requests within a chunk run in parallel
        const formattedData = await formatRespose(requestData); // returns { query: {}, body: {} }
        return formattedData;
    }));
    return bulkUpsert(allItemsArray);
};
function update() {
    // split urls into chunks as per your need, 100/1000
    var i, j, urlChunks = [],
        chunk = 100;
    for (i = 0, j = urls.length; i < j; i += chunk) {
        urlChunks.push(urls.slice(i, i + chunk));
    }
    // start each chunk inside the mapper so the concurrency option actually limits the work
    Bluebird.map(urlChunks, function (urlChunk) {
        return getAndUpdateData(urlChunk);
    }, {
        concurrency: 1 // 1 = one chunk of 100 requests fetched and inserted in the db at a time
    }).then(function () {
        console.log("done");
    }).catch(function (err) {
        console.log("error", err);
    });
}