如何使用addToSet提高更新性能

时间:2014-11-13 14:22:26

标签: python mongodb pymongo mongodb-query

我有一个超过600000条记录的集合。所有记录都具有以下结构(来自我的js控制台)

> db.invindex.find({_id: 'katar'}).pretty()
{
    "_id" : "katar",
    "place" : [
        {
            "index" : [
                1
            ],
            "pageid" : ObjectId("53fd177620ba27591133387b")
        },
        {
            "index" : [
                396
            ],
            "pageid" : ObjectId("53dc621420ba2708ea53c0f5")
        },
        {
            "index" : [
                4
            ],
            "pageid" : ObjectId("53f6e20c20ba271058d5c70c")
        },
        {
            "index" : [
                18,
                41
            ],
            "pageid" : ObjectId("53f6e20220ba271055d5c705")
        },
        {
            "index" : [
                3
            ],
            "pageid" : ObjectId("53f6e20420ba27105bd5c702")
        },
        {
            "pageid" : ObjectId("53f6e25220ba271079318d44"),
            "index" : [
                21
            ]
        },
        {
            "pageid" : ObjectId("53f6e43220ba271061d5c81a"),
            "index" : [
                2
            ]
        },
        {
            "pageid" : ObjectId("53f6e43320ba271061d5c81b"),
            "index" : [
                2,
                19
            ]
        },
        {
            "pageid" : ObjectId("53f6e42f20ba27105bd5c7ff"),
            "index" : [
                23
            ]
        },
        {
            "pageid" : ObjectId("53f6e4dd20ba27105bd5c860"),
            "index" : [
                8,
                24
            ]
        },
        {
            "pageid" : ObjectId("53f6e63a20ba271076318ed4"),
            "index" : [
                6,
                18
            ]
        },
        {
            "pageid" : ObjectId("53f6e63b20ba270ffd4533ea"),
            "index" : [
                2
            ]
        },
        {
            "pageid" : ObjectId("53f6e65720ba270ffd4533f6"),
            "index" : [
                428
            ]
        },
        {
            "pageid" : ObjectId("53f6eacb20ba271073319162"),
            "index" : [
                22
            ]
        },
        {
            "pageid" : ObjectId("53f6ed5d20ba271003453786"),
            "index" : [
                304
            ]
        }
    ]
}

place字段会被频繁更新。我想向place数组中添加一些新的子文档。下面是我用来完成这一操作的Python代码:

def update(doc, pos_list):
    """Add a ``{pageid, index}`` entry to the ``place`` array of one record.

    The entry is built from ``doc._id`` and ``pos_list`` and sent to the
    collection with the ``$addToSet`` operator.

    NOTE(review): ``self``, ``_id`` and ``upsert`` are not defined anywhere
    in this snippet -- presumably they come from an enclosing class/scope
    that is not shown; confirm before reuse.

    Args:
        doc: Object whose ``_id`` attribute is stored as ``pageid``.
        pos_list: Value stored under ``index``, eg, pos_list = [1,2,3].
    """
    # Payload for $addToSet: a single sub-document destined for 'place'.
    add_to_set = {
        'place': {
            'pageid': doc._id,
            'index': pos_list,
        },
    }

    # The current record is read first; the snippet does not show what is
    # done with the result.
    data = self.collection.find_one({'_id': _id})
    # Check if 'pageid' is already there in the record.
    # If not, then update

    update_spec = {'$addToSet': add_to_set}
    self.collection.update(
        {'_id': _id},
        update_spec,
        upsert=upsert,
        w=1,
    )

但是这个更新慢得像蜗牛!平均每次更新耗时1.3秒。我正在运行10个子进程以提高速度。下面是mongostat的输出:

insert  query update delete getmore command flushes mapped  vsize    res faults    locked db idx miss %     qr|qw   ar|aw  netIn netOut  conn       time 
    *0     25     19     *0       0     1|0       0  68.1g   137g   691m    130 spider:65.1%          0       1|0     6|0     5k    63m    37   20:14:30 
    *0      2      1     *0       0     1|0       0  68.1g   137g   629m    194 spider:10.8%          0       0|0     2|0   647b     8m    37   20:14:31 
    *0      5      8     *0       0     1|0       0  68.1g   137g   650m     27 spider:71.9%          0       6|0     1|0     1k     4m    37   20:14:32 
    *0     15     15     *0       0     1|0       0  68.1g   137g   654m    105 spider:52.4%          0       7|0     0|1     4k    32m    37   20:14:34 
    *0      9     12     *0       0     1|0       0  68.1g   137g   666m     38 spider:53.9%          0      11|0     0|1     2k     7m    37   20:14:35 
    *0     16     13     *0       0     1|0       0  68.1g   137g   633m    222 spider:46.6%          0       2|0     1|0     4k    40m    37   20:14:36 
    *0     10     11     *0       0     1|0       0  68.1g   137g   666m    103 spider:39.5%          0       6|0     0|1     2k    22m    37   20:14:38 
    *0     13     15     *0       0     1|0       0  68.1g   137g   655m     20 spider:90.0%          0      14|0     0|1     3k     1m    37   20:14:39 
    *0     18     17     *0       0     1|0       0  68.1g   137g   672m    179 spider:48.5%          0       0|0     5|0     5k    47m    37   20:14:40 
    *0     12     15     *0       0     1|0       0  68.1g   137g   661m    119 spider:40.2%          0       2|0     4|1     3k    15m    37   20:14:41 
insert  query update delete getmore command flushes mapped  vsize    res faults    locked db idx miss %     qr|qw   ar|aw  netIn netOut  conn       time 
    *0      9      8     *0       0     1|0       0  68.1g   137g   682m    109 spider:27.5%          0       6|0     0|1     2k    32m    37   20:14:43 
    *0     19     21     *0       0     1|0       0  68.1g   137g   659m    147 spider:85.6%          0       6|0     2|0     5k    20m    37   20:14:44 
    *0     15     20     *0       0     1|0       0  68.1g   137g   684m    137 spider:63.1%          0       9|0     1|0     4k    12m    37   20:14:45 
    *0     18     13     *0       0     1|0       0  68.1g   137g   634m    197 spider:39.0%          0       0|0     4|0     5k    51m    37   20:14:46 
    *0      6      6     *0       0     1|0       0  68.1g   137g   638m    209 spider:13.9%          0       0|0     2|0     1k     3m    37   20:14:47 
    *0      4      9     *0       0     1|0       0  68.1g   137g   641m    121 spider:37.4%          0      13|0     0|1   866b     3k    37   20:14:49 
    *0     19     15     *0       0     1|0       0  68.1g   137g   618m    141 spider:50.7%          0       0|0     4|0     5k    58m    37   20:14:50 
    *0      2      3     *0       0     1|0       0  68.1g   137g   624m    181 spider:15.4%          0       6|0     0|1   528b   207k    37   20:14:51 

我唯一的索引建在_id字段上。我的机器有4 GB内存。现在,我的问题是:

  • 这是正常的速度吗?
  • 如何提高更新速度?

编辑:我的索引

> db.invindex.stats()
{
    "ns" : "spider.invindex",
    "count" : 595306,
    "size" : 1302386304,
    "avgObjSize" : 2187,
    "storageSize" : 1580150784,
    "numExtents" : 19,
    "nindexes" : 1,
    "lastExtentSize" : 415174656,
    "paddingFactor" : 1,
    "systemFlags" : 1,
    "userFlags" : 1,
    "totalIndexSize" : 25648112,
    "indexSizes" : {
        "_id_" : 25648112
    },
    "ok" : 1
}

0 个答案:

没有答案