我有一个超过600000条记录的集合。所有记录都具有以下结构(来自我的js控制台)
> db.invindex.find({_id: 'katar'}).pretty()
{
"_id" : "katar",
"place" : [
{
"index" : [
1
],
"pageid" : ObjectId("53fd177620ba27591133387b")
},
{
"index" : [
396
],
"pageid" : ObjectId("53dc621420ba2708ea53c0f5")
},
{
"index" : [
4
],
"pageid" : ObjectId("53f6e20c20ba271058d5c70c")
},
{
"index" : [
18,
41
],
"pageid" : ObjectId("53f6e20220ba271055d5c705")
},
{
"index" : [
3
],
"pageid" : ObjectId("53f6e20420ba27105bd5c702")
},
{
"pageid" : ObjectId("53f6e25220ba271079318d44"),
"index" : [
21
]
},
{
"pageid" : ObjectId("53f6e43220ba271061d5c81a"),
"index" : [
2
]
},
{
"pageid" : ObjectId("53f6e43320ba271061d5c81b"),
"index" : [
2,
19
]
},
{
"pageid" : ObjectId("53f6e42f20ba27105bd5c7ff"),
"index" : [
23
]
},
{
"pageid" : ObjectId("53f6e4dd20ba27105bd5c860"),
"index" : [
8,
24
]
},
{
"pageid" : ObjectId("53f6e63a20ba271076318ed4"),
"index" : [
6,
18
]
},
{
"pageid" : ObjectId("53f6e63b20ba270ffd4533ea"),
"index" : [
2
]
},
{
"pageid" : ObjectId("53f6e65720ba270ffd4533f6"),
"index" : [
428
]
},
{
"pageid" : ObjectId("53f6eacb20ba271073319162"),
"index" : [
22
]
},
{
"pageid" : ObjectId("53f6ed5d20ba271003453786"),
"index" : [
304
]
}
]
}
place 字段会被频繁更新。我想向 place 子文档数组中追加一些新记录。下面是我用来完成此操作的 Python 程序。
def update(self, word, doc, pos_list, upsert=True):
    """Add one posting (pageid + word positions) to *word*'s inverted-index entry.

    Pushes ``{'pageid': doc._id, 'index': pos_list}`` into the ``place``
    array of the invindex document whose ``_id`` is *word*.

    Parameters:
        word: the indexed term, i.e. the ``_id`` of the invindex document
              (e.g. ``'katar'``).
        doc: the crawled page document; only its ``_id`` (an ObjectId) is
             used here.
        pos_list: list of positions of *word* within the page,
                  e.g. ``[1, 2, 3]``.
        upsert: create the invindex document if it does not exist yet.
    """
    posting = {
        'pageid': doc._id,
        'index': pos_list,
    }
    # NOTE: $addToSet deduplicates on the *entire* subdocument, so a
    # posting with the same pageid but a different index list is still
    # appended.  If "same pageid" alone must be unique, that has to be
    # enforced with an explicit query on 'place.pageid'.
    #
    # The original code issued an extra find_one() per call whose result
    # was never used — removed, since $addToSet already handles the
    # duplicate check server-side; this halves the round-trips on this
    # hot path.
    self.collection.update(
        {'_id': word},
        {'$addToSet': {'place': posting}},
        upsert=upsert,
        w=1,
    )
但是这个更新慢得像蜗牛!平均每次更新需要 1.3 秒。我正在运行 10 个子进程以提高速度。下面是 mongostat 的输出:
insert query update delete getmore command flushes mapped vsize res faults locked db idx miss % qr|qw ar|aw netIn netOut conn time
*0 25 19 *0 0 1|0 0 68.1g 137g 691m 130 spider:65.1% 0 1|0 6|0 5k 63m 37 20:14:30
*0 2 1 *0 0 1|0 0 68.1g 137g 629m 194 spider:10.8% 0 0|0 2|0 647b 8m 37 20:14:31
*0 5 8 *0 0 1|0 0 68.1g 137g 650m 27 spider:71.9% 0 6|0 1|0 1k 4m 37 20:14:32
*0 15 15 *0 0 1|0 0 68.1g 137g 654m 105 spider:52.4% 0 7|0 0|1 4k 32m 37 20:14:34
*0 9 12 *0 0 1|0 0 68.1g 137g 666m 38 spider:53.9% 0 11|0 0|1 2k 7m 37 20:14:35
*0 16 13 *0 0 1|0 0 68.1g 137g 633m 222 spider:46.6% 0 2|0 1|0 4k 40m 37 20:14:36
*0 10 11 *0 0 1|0 0 68.1g 137g 666m 103 spider:39.5% 0 6|0 0|1 2k 22m 37 20:14:38
*0 13 15 *0 0 1|0 0 68.1g 137g 655m 20 spider:90.0% 0 14|0 0|1 3k 1m 37 20:14:39
*0 18 17 *0 0 1|0 0 68.1g 137g 672m 179 spider:48.5% 0 0|0 5|0 5k 47m 37 20:14:40
*0 12 15 *0 0 1|0 0 68.1g 137g 661m 119 spider:40.2% 0 2|0 4|1 3k 15m 37 20:14:41
insert query update delete getmore command flushes mapped vsize res faults locked db idx miss % qr|qw ar|aw netIn netOut conn time
*0 9 8 *0 0 1|0 0 68.1g 137g 682m 109 spider:27.5% 0 6|0 0|1 2k 32m 37 20:14:43
*0 19 21 *0 0 1|0 0 68.1g 137g 659m 147 spider:85.6% 0 6|0 2|0 5k 20m 37 20:14:44
*0 15 20 *0 0 1|0 0 68.1g 137g 684m 137 spider:63.1% 0 9|0 1|0 4k 12m 37 20:14:45
*0 18 13 *0 0 1|0 0 68.1g 137g 634m 197 spider:39.0% 0 0|0 4|0 5k 51m 37 20:14:46
*0 6 6 *0 0 1|0 0 68.1g 137g 638m 209 spider:13.9% 0 0|0 2|0 1k 3m 37 20:14:47
*0 4 9 *0 0 1|0 0 68.1g 137g 641m 121 spider:37.4% 0 13|0 0|1 866b 3k 37 20:14:49
*0 19 15 *0 0 1|0 0 68.1g 137g 618m 141 spider:50.7% 0 0|0 4|0 5k 58m 37 20:14:50
*0 2 3 *0 0 1|0 0 68.1g 137g 624m 181 spider:15.4% 0 6|0 0|1 528b 207k 37 20:14:51
我唯一的索引是 _id 字段。我有 4 GB 的 RAM。现在,我的问题是:为什么这些更新这么慢,以及如何提高更新速度?
编辑:我的索引
> db.invindex.stats()
{
"ns" : "spider.invindex",
"count" : 595306,
"size" : 1302386304,
"avgObjSize" : 2187,
"storageSize" : 1580150784,
"numExtents" : 19,
"nindexes" : 1,
"lastExtentSize" : 415174656,
"paddingFactor" : 1,
"systemFlags" : 1,
"userFlags" : 1,
"totalIndexSize" : 25648112,
"indexSizes" : {
"_id_" : 25648112
},
"ok" : 1
}