I'm updating many data files with new observations using Python and the DocumentDB module. I have to upload 100-200 JSON files per minute, and the upload operation takes more time than the rest of the program. Right now I'm using the 'UpsertDocument' function on the module's DocumentClient. Is there a faster/better way to do this?
Answer 0 (score: 0)
You can use a stored procedure for bulk upsert operations:
function bulkimport2(docObject) {
    var collection = getContext().getCollection();
    var collectionLink = collection.getSelfLink();

    // The count of upserted docs, also used as the index of the current doc.
    var count = 0;

    // Validate input.
    if (!docObject || !docObject.items || !docObject.items.length) {
        getContext().getResponse().setBody(0);
        return;
    }

    var docs = docObject.items;
    var docsLength = docs.length;

    // Start with the first document.
    tryUpsert(docs[count], callback);

    // Note that there are 2 exit conditions:
    // 1) The upsertDocument request was not accepted.
    //    The callback will not be called; we report the current count so the
    //    client can call the script again with the remaining set of docs.
    // 2) The callback was called docs.length times.
    //    All documents were upserted, so we just set the final count and are done.
    function tryUpsert(doc, callback) {
        var isAccepted = collection.upsertDocument(collectionLink, doc, callback);

        // If the request was not accepted, this stored procedure has been running
        // too long and is about to be cancelled by the server. Reporting the
        // current count back to the client allows it to resume this batch from
        // the point we reached before isAccepted became false.
        if (!isAccepted) {
            getContext().getResponse().setBody(count);
        }
    }

    // Called when collection.upsertDocument is done and the document has been persisted.
    function callback(err, doc, options) {
        if (err) throw err;

        // One more document has been upserted, increment the count.
        count++;

        if (count >= docsLength) {
            // All documents have been upserted; just set the response and finish.
            getContext().getResponse().setBody(count);
        } else {
            // Upsert the next document.
            tryUpsert(docs[count], callback);
        }
    }
}
You can then load it from Python and execute it. Note that the stored procedure requires a partition key.
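A minimal sketch of how the call might look with the pydocumentdb DocumentClient; the host, key, links and partition key are placeholders, and the resume loop assumes the sproc's response body is the count it set via setBody:

import pydocumentdb.document_client as document_client

# Placeholder connection settings -- replace with your own values.
HOST = 'https://<your-account>.documents.azure.com:443/'
MASTER_KEY = '<your-master-key>'
SPROC_LINK = 'dbs/<db-id>/colls/<coll-id>/sprocs/bulkimport2'

client = document_client.DocumentClient(HOST, {'masterKey': MASTER_KEY})

def bulk_upsert(docs, partition_key):
    # The sproc reports how many documents it upserted before it was about to
    # be cancelled, so keep calling it with the remainder until nothing is left.
    remaining = docs
    while remaining:
        done = client.ExecuteStoredProcedure(
            SPROC_LINK,
            {'items': remaining},
            {'partitionKey': partition_key})
        remaining = remaining[done:]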
Hope this helps.
Answer 1 (score: 0)
One option is to use the Cosmos DB Spark connector instead, optionally (and conveniently) run as a job in Azure Databricks. That gives you a lot of control over throughput and makes it easy to find the best balance between parallelism (which I believe is the issue) and RU capacity on Cosmos DB.
Here is a simple example, measured while loading 118K documents on a minimum-spec Databricks cluster (just 1 worker):
Single Cosmos client in Python: 28 docs/sec @ 236 RU (i.e. not pushing Cosmos at all)
Spark Cosmos DB adapter: 66 docs/sec @ >200 RU (throttled by the 400 RU cap)
... after raising Cosmos DB to 10K RU: Spark Cosmos DB adapter: 1317 docs/sec @ >2.9K RU (not run long enough for an exact RU figure), still on the same minimum-spec cluster
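For reference, a write from a Databricks notebook through the azure-cosmosdb-spark connector looks roughly like the sketch below (PySpark; endpoint, key, database and collection names are placeholders, and the config keys/format string are the ones documented for that connector, so double-check against the version you install):

# Assumes a DataFrame `df` already holding the JSON documents to upsert.
writeConfig = {
    "Endpoint": "https://<your-account>.documents.azure.com:443/",
    "Masterkey": "<your-master-key>",
    "Database": "<database-id>",
    "Collection": "<collection-id>",
    "Upsert": "true",            # upsert rather than plain insert
    "WritingBatchSize": "500"    # tune together with the provisioned RUs
}

(df.write
   .format("com.microsoft.azure.cosmosdb.spark")
   .options(**writeConfig)
   .mode("append")
   .save())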
You could also try Python multithreading (I think that would help), and, as CYMA said in the comments, you should check for throttling on the Cosmos DB side. My observation, though, is that a single Cosmos client would not even get you to the minimum 400 RU.
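If you stay with the plain Python client, the multithreading idea could look something like this sketch (host, key, collection link and file paths are placeholders; it assumes sharing one DocumentClient across threads is acceptable, and you should watch the responses for 429 throttling and back off if it appears):

import json
from concurrent.futures import ThreadPoolExecutor

import pydocumentdb.document_client as document_client

# Placeholder connection settings -- replace with your own values.
HOST = 'https://<your-account>.documents.azure.com:443/'
MASTER_KEY = '<your-master-key>'
COLLECTION_LINK = 'dbs/<db-id>/colls/<coll-id>'

client = document_client.DocumentClient(HOST, {'masterKey': MASTER_KEY})

def upsert_file(path):
    # Read one JSON file and upsert it as a single document.
    with open(path) as f:
        doc = json.load(f)
    return client.UpsertDocument(COLLECTION_LINK, doc)

def upsert_all(paths, workers=8):
    # Keep several upserts in flight at once so the time one request spends
    # waiting on the network is used to send the others.
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(upsert_file, paths))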