我有一个像这样的mongodb(2.6版)文档:
{
"Uniprot": {
"GO": [
"cytoplasm [GO:0005737]",
"nucleolus [GO:0005730]",
"calcium ion binding [GO:0005509]",
"zinc ion binding [GO:0008270]"
],
"GO cc": [
"cytoplasm [GO:0005737]",
"nucleolus [GO:0005730]"
],
"GO bp": [
""
],
"GO mf": [
"calcium ion binding [GO:0005509]",
"zinc ion binding [GO:0008270]"
],
"GO_ID": [
"GO:0005509",
"GO:0005737",
"GO:0005730",
"GO:0008270"
]
}
}
我使用了以下代码,其中 test_3 是包含上述文档的集合:
def _by_go_category(if_cc, if_mf, if_bp):
    """Build a nested $cond that picks a value by GO category.

    The unwound "$Uniprot.GO" entry is compared, in order, with the unwound
    "GO cc", "GO mf" and "GO bp" entries; the value paired with the first
    equal one is returned, or False when none of them is equal.
    """
    def same_as(category_field):
        return {"$eq": ["$Uniprot.GO", category_field]}

    return {"$cond": [
        same_as("$Uniprot.GO cc"),
        if_cc,
        {"$cond": [
            same_as("$Uniprot.GO mf"),
            if_mf,
            {"$cond": [same_as("$Uniprot.GO bp"), if_bp, False]},
        ]},
    ]}


# $project stage: reshape every document into a flat row holding the
# accession, the matching GO description, a readable category label and the
# GO ID, with _id suppressed.
project = {"$project": {
    "_id": False,
    "Uniprot": "$Uniprot.Uniprot",
    "GO Description": _by_go_category(
        "$Uniprot.GO cc", "$Uniprot.GO mf", "$Uniprot.GO bp"),
    "GO Type": _by_go_category(
        "Cellular component", "Molecular function", "Biological Process"),
    "GO ID": "$Uniprot.GO_ID",
}}
# $redact stage: keep a document only when its unwound "GO" entry is equal to
# the unwound entry of at least one per-category array; prune it otherwise.
_GO_CATEGORY_FIELDS = ("$Uniprot.GO cc", "$Uniprot.GO mf", "$Uniprot.GO bp")

redact = {"$redact": {"$cond": [
    {"$or": [
        {"$eq": ["$Uniprot.GO", category_field]}
        for category_field in _GO_CATEGORY_FIELDS
    ]},
    "$$KEEP",
    "$$PRUNE",
]}}
# Assemble the pipeline stage by stage; the order of the stages matters.
pipeline = [{"$match": {"Uniprot.Uniprot": "P33764"}}]

# Unwind the combined GO array and the three per-category arrays. Each
# $unwind emits one document per array element, so after these four stages
# there is one document per combination of elements.
for array_path in ("$Uniprot.GO", "$Uniprot.GO cc",
                   "$Uniprot.GO bp", "$Uniprot.GO mf"):
    pipeline.append({"$unwind": array_path})

# Filter the combinations before unwinding GO_ID, then flatten the rows.
pipeline.append(redact)
pipeline.append({"$unwind": "$Uniprot.GO_ID"})
pipeline.append(project)

d = test_3.aggregate(pipeline)
输出结果为:
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
现在我使用以下方法修改输出:
# Post-process the aggregation rows in `d`: keep only rows whose "GO ID" equals
# the ID embedded at the end of "GO Description", strip that trailing
# " [GO:nnnnnnn]" suffix, and drop duplicate rows while preserving the order in
# which they were first seen.
GO_SUFFIX_LEN = 13  # len(" [GO:0005737]"): a space plus the bracketed GO ID

b = []
seen = set()  # fingerprints of the rows already stored in b
for row in d:
    description = row["GO Description"]
    suffix = description[-GO_SUFFIX_LEN:]
    # suffix[2:-1] drops the leading " [" and the trailing "]".
    if row["GO ID"] != suffix[2:-1]:
        continue
    entry = row.copy()
    entry["GO Description"] = description[:-GO_SUFFIX_LEN]
    # A set lookup replaces the linear `entry not in b` scan, which made the
    # dedup quadratic in the number of rows. Two rows are equal exactly when
    # their item sets are equal.
    # NOTE(review): assumes every projected value is hashable (plain strings,
    # as in the sample output) -- confirm for other documents.
    fingerprint = frozenset(entry.items())
    if fingerprint not in seen:
        seen.add(fingerprint)
        b.append(entry)
for entry in b:
    print(entry)
获得我预期的输出:
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding', u'Uniprot': u'P33764'}
然而,这种方式很慢,我想在mongodb中进行,而无需在python中进一步处理。我该怎么做?
我注意到的几点:由于对 GO 和 GO_ID 都做了展开($unwind),聚合会产生重复的行(这就是为什么代码里有 `if entry not in b`);另外,我需要检查 GO ID 是否包含在 GO Description 中,因为我没能用 `$search` 或 `$text` 找到解决办法。