创建去除冗余条目的新集合

时间:2014-09-29 02:55:13

标签: mongodb mongodb-query

我只想复制一个集合,但去掉其中多余(重复)的条目。当然,一种做法是导出整个集合,并把其中一个字段改作 _id。此外,我也可以复制集合,然后在该字段上建立索引以去除重复项——可行的方法有很多。

但是,有更优雅的解决方案吗?也许我可以做这样的事情。

// Attempted approach: for every distinct 'Query Sequence' value, fetch one
// matching document with findOne() and insert it into newcollection.
// NOTE: the asker reports that this fails because the array returned by
// distinct() is itself too large.
db.coll.distinct('Query Sequence').forEach(
         function(x){
            db.newcollection.insert(db.coll.findOne({'Query Sequence':x}))})

当然这不起作用......但是有没有人有这样的解决方案?

编辑 - 它不起作用的原因是因为distinct数组也很大。

编辑2 - 这就是它的样子。

// Find every document that shares this 'Query Sequence' value. The key must be
// spelled exactly as it is stored in the documents ("Query Sequence").
db.coll.find({'Query Sequence':'ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG'})

{
    "_id" : ObjectId("5424b996ce5254437868c1c9"),
    "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_2557_16557_7",
    "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
    "Chain type" : "VH",
    "Format Type" : "imgt",
    "Species" : "human",
    "Top V Hit" : "IGHV1-2*01",
    "Top D Hit" : "N/A",
    "Top J Hit" : "IGHJ4*01",
    "Productive" : "Yes",
    "Productive CDR3" : "True",
    "Strand" : "-",
    "Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
    "Framework 2 Nucleotides" : "AAGCCTTGCA",
    "Framework 4 Nucleotides" : "TGCAG",
    "CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
    "CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
    "Framework 1 AA" : "GWGGCTPQYI",
    "Framework 2 AA" : "KPC",
    "Framework 4 AA" : "C",
    "Framework 1 AA Length" : 10,
    "Framework 2 AA Length" : 3,
    "Framework 4 AA Length" : 1,
    "CDR1 AA" : "SSR*RCIQ",
    "CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
    "CDR1 AA Length" : 8,
    "CDR3 AA Length" : 21,
    "Total V Alignment Matches" : 64,
    "Total V Alignment Mismatches" : 1,
    "Total V Alignment Length" : 65,
    "Total V Alignment Gaps" : 0,
    "Total V Alignment Identity" : 98.5,
    "FW1 Alignment From" : 7,
    "FW1 Alignment To" : 37,
    "FW1 Alignment Matches" : 31,
    "FW1 Alignment Mismatches" : 0,
    "FW1 Alignment Length" : 31,
    "FW1 Alignment Gaps" : 0,
    "FW1 Alignment Identity" : 100,
    "FW2 Alignment From" : 62,
    "FW2 Alignment To" : 71,
    "FW2 Alignment Matches" : 9,
    "FW2 Alignment Mismatches" : 1,
    "FW2 Alignment Length" : 10,
    "FW2 Alignment Gaps" : 0,
    "FW2 Alignment Identity" : 90,
    "CDR1 Alignment From" : 38,
    "CDR1 Alignment To" : 61,
    "CDR1 Alignment Matches" : 24,
    "CDR1 Alignment Mismatches" : 0,
    "CDR1 Alignment Length" : 24,
    "CDR1 Alignment Gaps" : 0,
    "CDR1 Alignment Identity" : 100,
    "Junction V-End" : "CTGGG",
    "V-D Junction" : "N/A",
    "Junction D-Gene" : "N/A",
    "D-J Junction" : "N/A",
    "Junction J-Start" : "G",
    "Junction Merged" : "CTGGGG",
    "Stop Codon" : "No",
    "V-J frame" : "In-frame",
}

{
    "_id" : ObjectId("5424b996ce52544378867c128"),
    "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D",
    "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
    "Chain type" : "VH",
    "Format Type" : "imgt",
    "Species" : "human",
    "Top V Hit" : "IGHV1-2*01",
    "Top D Hit" : "N/A",
    "Top J Hit" : "IGHJ4*01",
    "Productive" : "Yes",
    "Productive CDR3" : "True",
    "Strand" : "-",
    "Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
    "Framework 2 Nucleotides" : "AAGCCTTGCA",
    "Framework 4 Nucleotides" : "TGCAG",
    "CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
    "CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
    "Framework 1 AA" : "GWGGCTPQYI",
    "Framework 2 AA" : "KPC",
    "Framework 4 AA" : "C",
    "Framework 1 AA Length" : 10,
    "Framework 2 AA Length" : 3,
    "Framework 4 AA Length" : 1,
    "CDR1 AA" : "SSR*RCIQ",
    "CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
    "CDR1 AA Length" : 8,
    "CDR3 AA Length" : 21,
    "Total V Alignment Matches" : 64,
    "Total V Alignment Mismatches" : 1,
    "Total V Alignment Length" : 65,
    "Total V Alignment Gaps" : 0,
    "Total V Alignment Identity" : 98.5,
    "FW1 Alignment From" : 7,
    "FW1 Alignment To" : 37,
    "FW1 Alignment Matches" : 31,
    "FW1 Alignment Mismatches" : 0,
    "FW1 Alignment Length" : 31,
    "FW1 Alignment Gaps" : 0,
    "FW1 Alignment Identity" : 100,
    "FW2 Alignment From" : 62,
    "FW2 Alignment To" : 71,
    "FW2 Alignment Matches" : 9,
    "FW2 Alignment Mismatches" : 1,
    "FW2 Alignment Length" : 10,
    "FW2 Alignment Gaps" : 0,
    "FW2 Alignment Identity" : 90,
    "CDR1 Alignment From" : 38,
    "CDR1 Alignment To" : 61,
    "CDR1 Alignment Matches" : 24,
    "CDR1 Alignment Mismatches" : 0,
    "CDR1 Alignment Length" : 24,
    "CDR1 Alignment Gaps" : 0,
    "CDR1 Alignment Identity" : 100,
    "Junction V-End" : "CTGGG",
    "V-D Junction" : "N/A",
    "Junction D-Gene" : "N/A",
    "D-J Junction" : "N/A",
    "Junction J-Start" : "G",
    "Junction Merged" : "CTGGGG",
    "Stop Codon" : "No",
    "V-J frame" : "In-frame",
}

如您所见,除了 ObjectId 和 Sequence Id 之外,其余内容完全相同。我只希望新集合中保留其中一个文档。我使用的是 Mongo 2.6.4。

1 个答案:

答案 0 :(得分:1)

我建议您尝试聚合框架。以下 mongo shell 程序演示了具体做法,并附带了一些额外的简化示例文档,以便更完整地说明。请注意,其中部分文档的 "Sequence Id" 的第一个字符有所更改。

管道阶段是:

  1. $sort:按 "Sequence Id" 降序排序,使较大的 "Sequence Id" 排在前面
  2. $group:按 "Query Sequence" 分组,并取每组的第一个文档(即该组中 "Sequence Id" 较大的文档)
  3. $project:将字段投影回顶层
  4. $out:将结果保存到新集合
  5. 这假定您的文档都具有相同的字段。如果文档结构不那么规整,您就必须让数据经由客户端程序往返处理——去掉管道中的 $project 和 $out,在客户端程序中分批处理,并手动将 doc 的字段投影到顶层。

    您需要额外的磁盘空间才能执行此操作。请至少预留 2 倍的空间:运行聚合框架时的临时空间占 1 倍,新集合的结果占 1 倍。

    有关文档,请参阅http://docs.mongodb.org/manual/core/aggregation-pipeline/

    希望这会有所帮助。

    aggregate-group-first-last.js:

    // Simplified sample documents: three distinct "Query Sequence" values, two of
    // which appear twice with different "Sequence Id" values.
    var docs = [
        {"Sequence Id":"M02331_41_000000000_AAW8D_1_1108_2557_16557_7","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG","ChainType":"VH"},
        {"Sequence Id":"M02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG","ChainType":"VH"},
        {"Sequence Id":"B02331_41_000000000_AAW8D_1_1108_2557_16557_7","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG","ChainType":"VH"},
        {"Sequence Id":"A02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG","ChainType":"VH"},
        {"Sequence Id":"C02331_41_000000000_AAW8D_1_1108_35567_85D","Query Sequence":"ATCTACGGTTGGGGCGGAT","ChainType":"VH"}
    ];
    // Start from an empty collection so repeated runs see the same data.
    db.test.remove({});
    // insert() accepts an array and is the documented way to add several
    // documents at once (save() is meant for a single document).
    db.test.insert(docs);
    var result = db.test.find().toArray();
    // The projection is derived from the FIRST stored document only, so it
    // assumes every document has the same top-level fields.
    var keys = Object.keys(result[0]);
    // Maps each top-level field name to its path under "doc", the field that the
    // $group stage uses to hold the whole original document.
    var project = {};
    // "var i" keeps the loop counter from being created as an implicit global.
    for (var i = 0; i < keys.length; i++) {
        project[keys[i]] = "$doc." + keys[i];
    }
    printjson(project);
    /**
     * Build the de-duplicating aggregation pipeline, print it, and return it.
     *
     * Stages: sort by "Sequence Id" descending, group by "Query Sequence"
     * keeping the first whole document of each group, project that document's
     * fields back to the top level, and write the result with $out.
     *
     * Reads the script-level `project` object for the $project stage.
     *
     * @param {string} out - Name of the collection passed to the $out stage.
     * @returns {Array<Object>} The four-stage pipeline array.
     */
    function pipelineWithOut(out) {
        // Declared with var so the pipeline stays local instead of leaking as an
        // implicit global (which also throws a ReferenceError in strict mode).
        var pipeline = [
            {"$sort": {"Sequence Id": -1}},
            // $$ROOT is the whole input document; after the descending sort,
            // $first picks the document listed first in each group.
            {"$group": {_id: "$Query Sequence", doc: {"$first": "$$ROOT"}}},
            {"$project": project},
            {"$out": out}
        ];
        printjson(pipeline);
        return pipeline;
    }
    // Remove any output collection left over from a previous run.
    db.testFirst.drop();
    // Run the pipeline and write its result to "testFirst" via $out.
    // allowDiskUse is passed as an aggregation option; per the MongoDB docs it
    // lets stages use temporary files on disk — confirm for your server version.
    db.test.aggregate(pipelineWithOut("testFirst"), {allowDiskUse: true});
    // Print the de-duplicated documents.
    printjson(db.testFirst.find().toArray());
    

    $ mongo aggregate-group-first-last.js

    MongoDB shell version: 2.6.4
    connecting to: test
    {
        "_id" : "$doc._id",
        "Sequence Id" : "$doc.Sequence Id",
        "Query Sequence" : "$doc.Query Sequence",
        "ChainType" : "$doc.ChainType"
    }
    [
        {
            "$sort" : {
                "Sequence Id" : -1
            }
        },
        {
            "$group" : {
                "_id" : "$Query Sequence",
                "doc" : {
                    "$first" : "$$ROOT"
                }
            }
        },
        {
            "$project" : {
                "_id" : "$doc._id",
                "Sequence Id" : "$doc.Sequence Id",
                "Query Sequence" : "$doc.Query Sequence",
                "ChainType" : "$doc.ChainType"
            }
        },
        {
            "$out" : "testFirst"
        }
    ]
    [
        {
            "_id" : ObjectId("54299b557d7122b60724e5f5"),
            "Sequence Id" : "B02331_41_000000000_AAW8D_1_1108_2557_16557_7",
            "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG",
            "ChainType" : "VH"
        },
        {
            "_id" : ObjectId("54299b557d7122b60724e5f7"),
            "Sequence Id" : "C02331_41_000000000_AAW8D_1_1108_35567_85D",
            "Query Sequence" : "ATCTACGGTTGGGGCGGAT",
            "ChainType" : "VH"
        },
        {
            "_id" : ObjectId("54299b557d7122b60724e5f4"),
            "Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D",
            "Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
            "ChainType" : "VH"
        }
    ]