场景:集合 A 有 4000 万条记录,每条记录有近 20 个字段。需要从 A 中取出 5 个(预先定义的)字段,更改字段名称后写入集合 B。
示例:
集合 A(“_id” 是这里的主键)
{
"_id":123
"id":123
"title":"test"
"summary": "test"
"version":1
"parentid":12
}
集合 B
{
"_id":123
"p$id":123
"p$parentid":12
"p$title":"test"
}
有人可以建议为这种情况编写代码的好方法吗?
我编写了代码,但需要5小时才能完成。 我的代码:
代码中引用的 config 包含所有与 MongoDB 相关的连接信息(主机、端口、数据库名和集合名)。
from pymongo import MongoClient
import operator
import datetime
# Log when the run starts. The single-argument parenthesised form prints the
# same text under Python 2 and Python 3.
print("Start time %s" % datetime.datetime.now())

# Module-level working state shared by the functions below.
primary_dict = {}
primary_list = []    # primary-collection document(s) currently being compared
secondary_dict = {}
secondary_list = []  # matching secondary-collection document(s)
missing_id = []      # _id values present in primary but absent from secondary
mismatch_id = []     # _id values whose primary and secondary documents differ
# Maps a field name in the primary collection to the name the same value is
# stored under in the secondary collection.
alias_dict = {
    "_id": "_id",
    "id": "p$id",
    "title": "p$title",
    "parentid": "p$parentid",
}
def mongo_connect(host, port, db, collection):
    """Open a connection and return a handle to one collection.

    A new ``MongoClient`` is created for ``host``/``port`` on every call; the
    database named ``db`` and then the collection named ``collection`` are
    looked up on it by subscript.
    """
    return MongoClient(host, port)[db][collection]
def primary():
    """Check every primary document against the secondary collection.

    Streams the ``_id`` and ``title`` of each document in the primary
    collection, looks up the document with the same ``_id`` in the secondary
    collection, and then either records the ``_id`` in ``missing_id`` (no
    counterpart exists) or passes the pair to ``compare``.

    NOTE(review): ``config`` is not imported in the visible source; it must be
    in scope before this function is called.
    """
    # Kept as module-level globals so that any code reading primary_list /
    # secondary_list directly keeps seeing the pair currently being checked.
    global primary_list
    global secondary_list

    primary_collection = mongo_connect(
        config.mongo_host, config.mongo_port,
        config.mongo_primary_db, config.mongo_primary_collection)
    secondary_collection = mongo_connect(
        config.mongo_host, config.mongo_port,
        config.mongo_secondary_db, config.mongo_secondary_collection)

    cursor = primary_collection.find({}, {"_id": 1, "title": 1}).batch_size(1000)
    for dict1 in cursor:
        target_id = dict1['_id']
        primary_list = [dict1]
        secondary_list = []

        # Query the secondary collection once and reuse the result; a second
        # identical find_one() would double the round trips for every match.
        counterpart = secondary_collection.find_one({"_id": target_id})
        if counterpart is None:
            missing_id.append(target_id)
            continue

        secondary_list.append(counterpart)
        compare(primary_list, secondary_list)
def compare(list1, list2):
    """Record the ``_id`` of each primary document that differs from its pair.

    ``list1`` holds primary documents and ``list2`` the secondary documents at
    the same positions. A pair is a mismatch when the two documents have a
    different number of fields, or when any primary field's value differs from
    the secondary value stored under ``alias_dict[field]``. Each mismatching
    ``_id`` is appended to the module-level ``mismatch_id`` list exactly once.

    Raises:
        KeyError: if a primary field has no entry in ``alias_dict``.

    NOTE(review): the field-count check compares the documents as fetched, so
    a projected primary document paired with an unprojected secondary document
    is always reported as a mismatch - confirm that this is intended.
    """
    absent = object()  # never equal to a stored value, so "field missing" counts as different
    for l1, l2 in zip(list1, list2):
        differs = len(l1) != len(l2) or any(
            value != l2.get(alias_dict[key], absent)
            for key, value in l1.items()
        )
        if differs:
            mismatch_id.append(l1['_id'])
# Run the comparison, then report what was found and when the run finished.
# The single-argument parenthesised print form produces the same text under
# Python 2 and Python 3.
primary()
print("Mismatch id list %s" % (mismatch_id,))
print("Missing Id list %s" % (missing_id,))
print("End time %s" % datetime.datetime.now())
答案 0 :(得分:0)
嗯,你可以这样做:
db.eval(function () {
    // Only these fields (plus _id, which find() returns by default) are copied.
    var wantedFields = { id: 1, parentid: 1, title: 1 };

    db.primary_collection.find({}, wantedFields).forEach(function (source) {
        // Build the target document: _id keeps its name, every other
        // field name gets the "p$" prefix.
        var renamed = {};
        Object.keys(source).forEach(function (field) {
            var target = "p$" + field;
            if (field == "_id") {
                target = field;
            }
            renamed[target] = source[field];
        });
        db.secondary_collection.insert(renamed);
    });
})
使用 db.eval() 可以直接在服务器上执行这段代码,这是你能得到的最快方式。
但请先阅读相关文档,因为此操作执行期间数据库会被“锁定”。当然,如果你原本打算跨服务器进行复制,这种方法就行不通了。