我只想复制一个集合,但要去掉其中重复的条目。一种办法当然是导出整个集合,并把其中一个字段改为_id;另外,我也可以先复制集合,再对该字段建立索引来去除重复项——办法有很多。
但是,有更优雅的解决方案吗?也许我可以做这样的事情。
// For every distinct 'Query Sequence' value, copy one matching document
// from coll into newcollection.
db.coll.distinct('Query Sequence').forEach(function (sequence) {
    var representative = db.coll.findOne({'Query Sequence': sequence});
    db.newcollection.insert(representative);
});
当然这不起作用......但是有没有人有这样的解决方案?
编辑 - 它不起作用的原因是distinct返回的数组太大了。
编辑2 - 这就是它的样子。
// Fetch every document that shares this 'Query Sequence' value. The key must
// match the stored field name ("Query Sequence") exactly.
db.coll.find({'Query Sequence':'ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG'})
{
"_id" : ObjectId("5424b996ce5254437868c1c9"),
"Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_2557_16557_7",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
"Chain type" : "VH",
"Format Type" : "imgt",
"Species" : "human",
"Top V Hit" : "IGHV1-2*01",
"Top D Hit" : "N/A",
"Top J Hit" : "IGHJ4*01",
"Productive" : "Yes",
"Productive CDR3" : "True",
"Strand" : "-",
"Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
"Framework 2 Nucleotides" : "AAGCCTTGCA",
"Framework 4 Nucleotides" : "TGCAG",
"CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
"CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
"Framework 1 AA" : "GWGGCTPQYI",
"Framework 2 AA" : "KPC",
"Framework 4 AA" : "C",
"Framework 1 AA Length" : 10,
"Framework 2 AA Length" : 3,
"Framework 4 AA Length" : 1,
"CDR1 AA" : "SSR*RCIQ",
"CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
"CDR1 AA Length" : 8,
"CDR3 AA Length" : 21,
"Total V Alignment Matches" : 64,
"Total V Alignment Mismatches" : 1,
"Total V Alignment Length" : 65,
"Total V Alignment Gaps" : 0,
"Total V Alignment Identity" : 98.5,
"FW1 Alignment From" : 7,
"FW1 Alignment To" : 37,
"FW1 Alignment Matches" : 31,
"FW1 Alignment Mismatches" : 0,
"FW1 Alignment Length" : 31,
"FW1 Alignment Gaps" : 0,
"FW1 Alignment Identity" : 100,
"FW2 Alignment From" : 62,
"FW2 Alignment To" : 71,
"FW2 Alignment Matches" : 9,
"FW2 Alignment Mismatches" : 1,
"FW2 Alignment Length" : 10,
"FW2 Alignment Gaps" : 0,
"FW2 Alignment Identity" : 90,
"CDR1 Alignment From" : 38,
"CDR1 Alignment To" : 61,
"CDR1 Alignment Matches" : 24,
"CDR1 Alignment Mismatches" : 0,
"CDR1 Alignment Length" : 24,
"CDR1 Alignment Gaps" : 0,
"CDR1 Alignment Identity" : 100,
"Junction V-End" : "CTGGG",
"V-D Junction" : "N/A",
"Junction D-Gene" : "N/A",
"D-J Junction" : "N/A",
"Junction J-Start" : "G",
"Junction Merged" : "CTGGGG",
"Stop Codon" : "No",
"V-J frame" : "In-frame",
}
{
"_id" : ObjectId("5424b996ce52544378867c128"),
"Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
"Chain type" : "VH",
"Format Type" : "imgt",
"Species" : "human",
"Top V Hit" : "IGHV1-2*01",
"Top D Hit" : "N/A",
"Top J Hit" : "IGHJ4*01",
"Productive" : "Yes",
"Productive CDR3" : "True",
"Strand" : "-",
"Framework 1 Nucleotides" : "GGTTGGGGCGGATGCACTCCCCAGTACATAT",
"Framework 2 Nucleotides" : "AAGCCTTGCA",
"Framework 4 Nucleotides" : "TGCAG",
"CDR1 Nucleotides" : "AGTAGCCGGTGAAGGTGTATCCAG",
"CDR3 Nucleotides" : "CGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTG",
"Framework 1 AA" : "GWGGCTPQYI",
"Framework 2 AA" : "KPC",
"Framework 4 AA" : "C",
"Framework 1 AA Length" : 10,
"Framework 2 AA Length" : 3,
"Framework 4 AA Length" : 1,
"CDR1 AA" : "SSR*RCIQ",
"CDR3 AA" : "RLGRMHSPVHIVAGEGVSRSL",
"CDR1 AA Length" : 8,
"CDR3 AA Length" : 21,
"Total V Alignment Matches" : 64,
"Total V Alignment Mismatches" : 1,
"Total V Alignment Length" : 65,
"Total V Alignment Gaps" : 0,
"Total V Alignment Identity" : 98.5,
"FW1 Alignment From" : 7,
"FW1 Alignment To" : 37,
"FW1 Alignment Matches" : 31,
"FW1 Alignment Mismatches" : 0,
"FW1 Alignment Length" : 31,
"FW1 Alignment Gaps" : 0,
"FW1 Alignment Identity" : 100,
"FW2 Alignment From" : 62,
"FW2 Alignment To" : 71,
"FW2 Alignment Matches" : 9,
"FW2 Alignment Mismatches" : 1,
"FW2 Alignment Length" : 10,
"FW2 Alignment Gaps" : 0,
"FW2 Alignment Identity" : 90,
"CDR1 Alignment From" : 38,
"CDR1 Alignment To" : 61,
"CDR1 Alignment Matches" : 24,
"CDR1 Alignment Mismatches" : 0,
"CDR1 Alignment Length" : 24,
"CDR1 Alignment Gaps" : 0,
"CDR1 Alignment Identity" : 100,
"Junction V-End" : "CTGGG",
"V-D Junction" : "N/A",
"Junction D-Gene" : "N/A",
"D-J Junction" : "N/A",
"Junction J-Start" : "G",
"Junction Merged" : "CTGGGG",
"Stop Codon" : "No",
"V-J frame" : "In-frame",
}
如您所见,除了ObjectId和Sequence Id之外,其余内容完全相同。我只想在新集合中保留其中一个文档。我使用的是Mongo 2.6.4
答案 0 :(得分:1)
我建议您尝试聚合框架。以下mongo shell程序演示了如何做到这一点,并附带了一些简化的示例文档,以便更完整地说明。请注意,部分文档的"Sequence Id"的第一个字符做了改动。
管道阶段依次是:$sort、$group(用$first保留每组的第一个文档)、$project和$out。
这里假定您的文档都具有相同的字段。
如果文档结构不那么规整,您就必须通过客户端程序中转数据:去掉管道中的$project和$out,在客户端程序中分批处理,并手动把doc字段的内容投影到顶层。
执行此操作需要额外的磁盘空间。请至少预留2倍的空间:1倍用作运行聚合框架时的临时空间,1倍用于存放新集合的结果。
有关文档,请参阅http://docs.mongodb.org/manual/core/aggregation-pipeline/
希望这会有所帮助。
aggregate-group-first-last.js:
// Sample fixture: five simplified documents. The first two share one
// "Query Sequence", the next two share another, and the last one is unique.
var docs = [
    {
        "Sequence Id": "M02331_41_000000000_AAW8D_1_1108_2557_16557_7",
        "Query Sequence": "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
        "ChainType": "VH"
    },
    {
        "Sequence Id": "M02331_41_000000000_AAW8D_1_1108_35567_85D",
        "Query Sequence": "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
        "ChainType": "VH"
    },
    {
        "Sequence Id": "B02331_41_000000000_AAW8D_1_1108_2557_16557_7",
        "Query Sequence": "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG",
        "ChainType": "VH"
    },
    {
        "Sequence Id": "A02331_41_000000000_AAW8D_1_1108_35567_85D",
        "Query Sequence": "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG",
        "ChainType": "VH"
    },
    {
        "Sequence Id": "C02331_41_000000000_AAW8D_1_1108_35567_85D",
        "Query Sequence": "ATCTACGGTTGGGGCGGAT",
        "ChainType": "VH"
    }
];
// Reset the scratch collection and load the sample documents into it.
db.test.remove({});
db.test.save(docs);

// Build the $project specification from the field names of the first stored
// document: each top-level field is mapped to the same field nested under
// "doc", which is where the $group stage keeps the whole original document.
var result = db.test.find().toArray();
var keys = Object.keys(result[0]);
var project = {};
// forEach keeps the iteration variable local; the earlier index loop assigned
// to an undeclared `i`, which silently created a global.
keys.forEach(function (key) {
    project[key] = "$doc." + key;
});
printjson(project);
/**
 * Build the aggregation pipeline that keeps one document per distinct
 * "Query Sequence" and writes the result to another collection.
 *
 * Stages, in order:
 *   1. $sort    - order by "Sequence Id", descending.
 *   2. $group   - group on "Query Sequence", keeping the first whole
 *                 document ($$ROOT) of each group under `doc`.
 *   3. $project - apply the script-level `project` specification.
 *   4. $out     - write the result to the collection named by `out`.
 *
 * The pipeline is also printed with printjson before being returned.
 *
 * @param {string} out - Name of the collection given to the $out stage.
 * @returns {Array<Object>} The pipeline stages.
 */
function pipelineWithOut(out) {
    // Declared with `var` so the pipeline stays local to this call; assigning
    // to an undeclared name would create a global (and throws in strict mode).
    var pipeline = [
        {"$sort": {"Sequence Id": -1}},
        {"$group": {_id: "$Query Sequence", doc: {"$first": "$$ROOT"}}},
        {"$project": project},
        {"$out": out}
    ];
    printjson(pipeline);
    return pipeline;
}
// Drop any previous output collection, run the pipeline against `test`
// (writing into `testFirst`), then print the contents of `testFirst`.
db.getCollection("testFirst").drop();
db.getCollection("test").aggregate(
    pipelineWithOut("testFirst"),
    {allowDiskUse: true}
);
printjson(db.getCollection("testFirst").find().toArray());
$ mongo aggregate-group-first-last.js
MongoDB shell version: 2.6.4
connecting to: test
{
"_id" : "$doc._id",
"Sequence Id" : "$doc.Sequence Id",
"Query Sequence" : "$doc.Query Sequence",
"ChainType" : "$doc.ChainType"
}
[
{
"$sort" : {
"Sequence Id" : -1
}
},
{
"$group" : {
"_id" : "$Query Sequence",
"doc" : {
"$first" : "$$ROOT"
}
}
},
{
"$project" : {
"_id" : "$doc._id",
"Sequence Id" : "$doc.Sequence Id",
"Query Sequence" : "$doc.Query Sequence",
"ChainType" : "$doc.ChainType"
}
},
{
"$out" : "testFirst"
}
]
[
{
"_id" : ObjectId("54299b557d7122b60724e5f5"),
"Sequence Id" : "B02331_41_000000000_AAW8D_1_1108_2557_16557_7",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAG",
"ChainType" : "VH"
},
{
"_id" : ObjectId("54299b557d7122b60724e5f7"),
"Sequence Id" : "C02331_41_000000000_AAW8D_1_1108_35567_85D",
"Query Sequence" : "ATCTACGGTTGGGGCGGAT",
"ChainType" : "VH"
},
{
"_id" : ObjectId("54299b557d7122b60724e5f4"),
"Sequence Id" : "M02331_41_000000000_AAW8D_1_1108_35567_85D",
"Query Sequence" : "ATCTACGGTTGGGGCGGATGCACTCCCCAGTACATATAGTAGCCGGTGAAGGTGTATCCAGAAGCCTTGCAGGAGACCTTCACTGAGGCCGAAGAG",
"ChainType" : "VH"
}
]