删除mongodb中的重复值

时间:2015-12-22 11:05:36

标签: python mongodb mongodb-query pymongo

我正在结合 Tornado 使用 Python 学习 MongoDB。我有一个 MongoDB 集合,当我执行以下查询时:

db.cal.find()

{     
    "Pid" : "5652f92761be0b14889d9854",
    "Registration" : "TN 56 HD 6766",
    "Vid" : "56543ed261be0b0a60a896c9",
    "Period" : "10-2015",
    "AOs": [
        "14-10-2015",
        "15-10-2015",
        "18-10-2015",
        "14-10-2015",
        "15-10-2015",
        "18-10-2015"
    ],
    "Booked": [
        "5-10-2015",
        "7-10-2015",
        "8-10-2015",
        "5-10-2015",
        "7-10-2015",
        "8-10-2015"
    ],
    "NA": [
        "1-10-2015",
        "2-10-2015",
        "3-10-2015",
        "4-10-2015",
        "1-10-2015",
        "2-10-2015",
        "3-10-2015",
        "4-10-2015"
    ],

    "AOr": [
        "23-10-2015",
        "27-10-2015",
        "23-10-2015",
        "27-10-2015"
    ]
}

我需要一个操作来从Booked,NA,AOs,AOr中删除重复值。最后它应该是

{
     "Pid" : "5652f92761be0b14889d9854",
      "Registration" : "TN 56 HD 6766",
      "Vid" : "56543ed261be0b0a60a896c9",
      "AOs": [
        "14-10-2015",
        "15-10-2015",
        "18-10-2015",

      ],
      "Booked": [
        "5-10-2015",
        "7-10-2015",
        "8-10-2015",

      ],

      "NA": [
        "1-10-2015",
        "2-10-2015",
        "3-10-2015",
        "4-10-2015",

      ],

      "AOr": [
        "23-10-2015",
        "27-10-2015",

      ]
}

我如何在mongodb中实现这一目标?

4 个答案:

答案 0 :(得分:0)

工作解决方案

我已经创建了一个基于JavaScript的工作解决方案,可以在mongo shell上找到:

// Array fields whose duplicate entries should be removed
var codes = ["AOs", "Booked", "NA", "AOr"]

// Use bulk operations for efficiency
var bulk = db.dupes.initializeUnorderedBulkOp()

// Number of queued updates; executing an empty bulk raises an error,
// so execute() is only called when at least one update was queued
var queued = 0

db.dupes.find().forEach(
  function(doc) {

    // Collects only the fields that actually lost duplicates, so the
    // update touches nothing else in the document
    var changes = {}
    // Needed to prevent unnecessary operations
    var changed = false

    codes.forEach(
      function(code) {
        var values = doc[code]

        // Documents that lack this field (or hold a non-array there)
        // are left untouched instead of throwing on values.length
        if (!Array.isArray(values)) {
          return
        }

        var uniq = []

        for (var i = 0; i < values.length; i++) {
          // Keep a value only the first time it is seen, which also
          // preserves the original order of the entries
          if (uniq.indexOf(values[i]) == -1) {
            uniq.push(values[i])
          }
        }

        if (uniq.length < values.length) {
          changes[code] = uniq
          changed = true
        }

      }
    )

    // Update the document only if something was changed
    if (changed) {
      bulk.find({"_id": doc._id}).updateOne({"$set": changes})
      queued++
    }
  }
)

// Apply all changes
if (queued > 0) {
  bulk.execute()
}

带有示例输入的结果文档:

replset:PRIMARY> db.dupes.find().pretty()
{
  "_id" : ObjectId("567931aefefcd72d0523777b"),
  "Pid" : "5652f92761be0b14889d9854",
  "Registration" : "TN 56 HD 6766",
  "Vid" : "56543ed261be0b0a60a896c9",
  "Period" : "10-2015",
  "AOs" : [
    "14-10-2015",
    "15-10-2015",
    "18-10-2015"
  ],
  "Booked" : [
    "5-10-2015",
    "7-10-2015",
    "8-10-2015"
  ],
  "NA" : [
    "1-10-2015",
    "2-10-2015",
    "3-10-2015",
    "4-10-2015"
  ],
  "AOr" : [
    "23-10-2015",
    "27-10-2015"
  ]
}

使用带有 dropDups 的索引

这根本行不通。首先,根据3.0版,此选项不再存在。由于我们已经发布了3.2版本,我们应该找到一种可移植的方式。

其次,即使使用dropDups,文档也明确指出:

  

dropDups 布尔值:MongoDB仅对第一次出现的键进行索引,并从集合中删除所有包含该键的后续出现的 文档

因此,如果另一个文件在其中一个帐单代码中具有与前一个相同的值,则整个文档将被删除。

答案 1 :(得分:0)

您不能首先使用“dropDups”语法,因为它已从MongoDB 2.6中“弃用”并在MongoDB 3.0中删除,甚至无法使用。

要从每个列表中删除重复项,您需要在 Python 中使用 set 类。

MongoDB 3.2 弃用了 Bulk() 及其相关方法,并提供了 .bulkWrite() 方法。该方法从 Pymongo 3.2 起以 bulk_write() 的形式提供。使用此方法要做的第一件事是导入 UpdateOne 类。

from pymongo import UpdateOne

requests = []  # list of write operations
for document in collection.find():
    update = dict(zip(fields, [list(set(document[field])) for field in fields]))
    requests.append(UpdateOne({'_id': document['_id']}, {'$set': update}))
collection.bulk_write(requests)

{'AOr': ['27-10-2015', '23-10-2015'],
 'AOs': ['15-10-2015', '14-10-2015', '18-10-2015'],
 'Booked': ['7-10-2015', '5-10-2015', '8-10-2015'],
 'NA': ['1-10-2015', '4-10-2015', '3-10-2015', '2-10-2015'],
 'Period': '10-2015',
 'Pid': '5652f92761be0b14889d9854',
 'Registration': 'TN 56 HD 6766',
 'Vid': '56543ed261be0b0a60a896c9',
 '_id': ObjectId('567f808fc6e11b467e59330f')}

这两个查询给出了相同和预期的结果:

#include <iostream>
#include <vector>
#include <string>

using namespace std;

// Prompts on standard output and reads two whitespace-delimited tokens
// from standard input into the two strings.
void input(string &largeString1, string &largeString2);
// Fills largeInt1 / largeInt2 with integer digits taken from the
// characters of largeString1 / largeString2.
void convert(string largeString1, string largeString2, vector<int> &largeInt1, vector<int> &largeInt2);
// Returns the numeric offset of ch from the character '0'.
int asciiToInt(char ch);
// Stores the sum of the two digit vectors in finalInt.
void add(vector<int> largeInt1, vector<int> largeInt2, vector<int> &finalInt);
// Prints a header followed by the elements of finalInt to standard output.
void output(const vector<int> finalInt);

// Entry point: performs four rounds of read -> convert -> add -> print,
// then runs the shell command "pause" before exiting with status 0.
int main()
{
    // Raw operands exactly as typed by the user.
    std::string largeString1, largeString2;

    // Digit buffers: 12 zero-initialised slots per operand, 13 for the sum.
    std::vector<int> largeInt1(12), largeInt2(12);
    std::vector<int> finalInt(13);

    int roundsLeft = 4;
    while (roundsLeft-- > 0)
    {
        input(largeString1, largeString2);
        convert(largeString1, largeString2, largeInt1, largeInt2);
        add(largeInt1, largeInt2, finalInt);
        output(finalInt);
    }

    system("pause");
    return 0;
}
// Prints the "Input:" prompt followed by a blank line, then reads two
// whitespace-delimited tokens from standard input into the arguments.
void input(std::string &largeString1, std::string &largeString2)
{
    std::cout << "Input:" << std::endl;
    std::cout << std::endl;
    std::cin >> largeString1 >> largeString2;
}
// Copies the characters of `text` into `digits` as integers, right-aligned:
// the last character lands in the last slot and every slot to the left of
// the number is set to 0. When `text` has more characters than `digits` has
// slots, only the trailing (least significant) characters are kept.
// Characters are mapped with `ch - '0'`; non-digit input is not validated.
static void fillDigitsRightAligned(const std::string &text, std::vector<int> &digits)
{
    const std::size_t width = digits.size();
    const std::size_t length = text.size();
    for (std::size_t i = 0; i < width; ++i)
    {
        // i counts positions from the least significant end of both buffers.
        digits[width - 1 - i] = (i < length) ? (text[length - 1 - i] - '0') : 0;
    }
}

// Converts two decimal strings into digit vectors.
//
// largeString1 / largeString2: the numbers as text, most significant digit first.
// largeInt1 / largeInt2: destination vectors; their existing size determines
//     how many digit slots are filled. Each is overwritten completely, with
//     the number right-aligned and zero-padded on the left.
//
// Indexing starts at the last real character (size - 1), never at the
// terminator, and positions beyond the start of a short string are padded
// with 0 rather than read out of bounds.
void convert(std::string largeString1, std::string largeString2, std::vector<int> &largeInt1, std::vector<int> &largeInt2)
{
    fillDigitsRightAligned(largeString1, largeInt1);
    fillDigitsRightAligned(largeString2, largeInt2);
}
// Returns the distance of `ch` from the character '0'
// (so '0'..'9' map to 0..9).
int asciiToInt(char ch)
{
    const int zeroCode = '0';
    const int charCode = ch;
    return charCode - zeroCode;
}
// Adds two non-negative numbers stored as decimal digit vectors.
//
// largeInt1 / largeInt2: operands, most significant digit first, one decimal
//     digit per element.
// finalInt: receives the sum, most significant digit first. It is resized to
//     one element more than the longer operand (13 for two 12-digit inputs)
//     so the final carry always fits; unused leading positions are 0.
//
// The addition walks from the least significant digit and propagates the
// carry, so every element of the result is a single decimal digit. Positions
// past the front of an operand contribute 0 instead of being read.
void add(std::vector<int> largeInt1, std::vector<int> largeInt2, std::vector<int> &finalInt)
{
    const std::size_t len1 = largeInt1.size();
    const std::size_t len2 = largeInt2.size();
    const std::size_t width = (len1 > len2 ? len1 : len2) + 1;

    finalInt.assign(width, 0);

    int carry = 0;
    for (std::size_t i = 0; i < width; ++i)
    {
        // i counts positions from the least significant end.
        int sum = carry;
        if (i < len1)
        {
            sum += largeInt1[len1 - 1 - i];
        }
        if (i < len2)
        {
            sum += largeInt2[len2 - 1 - i];
        }
        finalInt[width - 1 - i] = sum % 10;
        carry = sum / 10;
    }
}
// Prints the result header followed by every element of `finalInt`, in
// order and with no separators, to standard output.
//
// finalInt: digits of the sum, most significant first. All elements are
//     printed (leading zeros included), however many the vector holds, so
//     a vector of any length is handled without reading past its end.
//
// No newline is written after the last digit.
void output(const std::vector<int> finalInt)
{
    std::cout << std::endl << "Output:" << std::endl << std::endl << "The sum is: ";
    for (const int digit : finalInt)
    {
        std::cout << digit;
    }
}

答案 2 :(得分:-1)

你试过“Distinct()”吗?

链接:https://docs.mongodb.org/v3.0/reference/method/db.collection.distinct/

使用不同的

指定查询

以下示例从dept等于“A”的文档中返回项目字段中嵌入的字段sku的不同值:

db.inventory.distinct( "item.sku", { dept: "A" } )

该方法返回以下不同sku值的数组:

[ "111", "333" ]

答案 3 :(得分:-1)

假设您要从集合中删除重复日期,因此您可以使用dropDups:true选项添加唯一索引:

db.bill_codes.ensureIndex({"fieldName":1}, {unique: true, dropDups: true}) 

更多参考: db.collection.ensureIndex() - MongoDB Manual 3.0

注意:请先备份您的数据库,以防它没有完全符合您的预期。