从嵌套数组中检索匹配的文档

时间:2018-06-12 11:49:48

标签: mongodb aggregate

我试图从嵌套数组中收集所有 spec 字段等于 "Unknown" 的对象。示例文档:{ "_id" :"5b1e73786f11e421956023c3", "subs" : [ { "name" : "subrepo1", "files" : [ { "name" : ".....", "spec" : "Unknown" }, { "name" : ".....", "spec" : "Unknown" } ] }, { "name" : "subrepo2", "files" : [ { "name" : "file2", "spec" : "Unknown" }, { "name" : ".....", "spec" : "1234" } ] } ]  }

我尝试了以下聚合查询,但它不起作用:

// Original attempt quoted from the question — kept as-is; known issues noted below.
db.col.aggregate([
    {$match: {'subs.files.spec': 'Unknown'}},
    {$project: {
        'subs.files': {$filter: {
            // NOTE(review): '$subs.files' resolves to an array of arrays (one
            // per element of "subs"), so $filter compares whole inner arrays,
            // not individual file objects.
            input: '$subs.files',
            //as: 'subs.files',   // NOTE(review): 'as' must be a simple variable name without dots
            // BUG: 'this.spec' is a plain string literal here — the variable
            // reference must be written '$$this.spec'; also 'FunSuite'
            // contradicts the intended match value 'Unknown'.
            cond: {$eq: ['this.spec', 'FunSuite']}
        }},
        //_id: 0
    }}
])

我认为这是正确的方向,但我可能错过了一些重要的事情。每个文档的结构与此类似:

{
    "_id" : "5b1e73786f11e421956023c3",
    "subs" : [ 
        {
            "name" : "subrepo1",
            "files" : [ 
                {
                    "name" : ".....",
                    "spec" : "Unknown"
                }, 
                {
                    "name" : ".....",
                    "spec" : "Unknown"
                }
            ]
        },
        {
            "name" : "subrepo2",
            "files" : [ 
                {
                    "name" : "file2",
                    "spec" : "Unknown"
                }
            ]
        }
    ]
}

预期的输出是:只包含那些 spec 等于 "Unknown" 的文件(而不是其他文件)。

#import the library used to query a website
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
from multiprocessing import Pool, cpu_count
from datetime import datetime

# Wall-clock start; used at the bottom of the script to report total runtime.
t1 = datetime.now()

hmdb_link = "http://www.hmdb.ca/metabolites"
page_link = 'http://www.hmdb.ca/metabolites?c=hmdb_id&d=up&page=1'

# One listing-page URL per page of the HMDB metabolite index (pages 1..4561).
plinks = ['http://www.hmdb.ca/metabolites?c=hmdb_id&d=up&page=' + str(i)
          for i in range(1, 4562)]


def scrape_hmdb(link):
    """Scrape one HMDB metabolite listing page plus every metabolite card it
    links to, and return the collected fields as a pandas DataFrame.

    NOTE(review): in the original paste the body of this function (from
    ``page = urllib2.urlopen(...)`` down to ``return df``) was dedented to
    module level, which is a SyntaxError; it has been re-indented here.
    Python 2 code (urllib2, old BeautifulSoup API) — behaviour otherwise
    unchanged.

    :param link: URL of one listing page, e.g.
        ``http://www.hmdb.ca/metabolites?c=hmdb_id&d=up&page=1``
    :return: DataFrame with one row per metabolite found on the page.
        NOTE(review): pandas raises if the accumulator lists end up with
        different lengths (a metabolite page missing one of the fields) —
        this matches the original code's behaviour; confirm it is intended.
    """
    # One accumulator list per output column (the original used a..m, which
    # shadowed names reused elsewhere, e.g. the loop variable ``i``).
    hmdb_ids = []
    secondary_accessions = []
    common_names = []
    chemical_formulas = []
    avg_mol_weights = []
    drugbank_ids = []
    fooddb_ids = []
    chemspider_ids = []
    kegg_ids = []
    biocyc_ids = []
    metlin_ids = []
    pubchem_ids = []
    metab_urls = []

    page = urllib2.urlopen(link).read()
    soup = BeautifulSoup(page)
    table = soup.find('table', {'class': "table table-striped table-condensed table-hover metabolites"})
    for row in table.findAll("tr"):
        cells = row.findAll('td')
        for cell in cells:
            # Only cells carrying a "btn-card" anchor link to a metabolite page.
            links = cell.find('a', {'class': 'btn-card'})
            if links is None:
                continue
            metab_link = hmdb_link + '/' + cell.a.string
            # py2/py3-safe, same output as the original `print link, metab_link`
            print("%s %s" % (link, metab_link))
            metab_urls.append(metab_link)
            metab_page = urllib2.urlopen(metab_link).read()
            metab_soup = BeautifulSoup(metab_page)
            hmdb_table = metab_soup.find('table', {'class': 'content-table table table-condensed table-bordered'})

            # Renamed from the original's `row`/`cells`, which shadowed the
            # outer loop variables.
            for detail_row in hmdb_table.findAll("tr"):
                detail_cells = detail_row.findAll('td')
                headers = detail_row.findAll('th')  # row label lives in <th>
                if detail_cells == [] or headers == []:
                    continue
                col_name = str(headers[0].find(text=True))
                value = detail_cells[0].find(text=True)
                if col_name == 'HMDB ID':
                    hmdb_ids.append(value)
                elif col_name == 'Secondary Accession Numbers':
                    # Several accession numbers arrive newline-separated in one
                    # cell; keep them as a list (split into rows later by
                    # row_val_split).
                    texts = [c.text for c in detail_cells]
                    secondary_accessions.append(texts[0].split('\n'))
                elif col_name == 'Common Name':
                    common_names.append(value)
                elif col_name == 'Chemical Formula':
                    # The formula is spread over sub/sup tags; re-join pieces.
                    parts = [c.text for c in detail_cells]
                    chemical_formulas.append(''.join(str(p) for p in parts))
                elif col_name == 'Average Molecular Weight':
                    avg_mol_weights.append(value)
                elif col_name == 'DrugBank ID':
                    drugbank_ids.append(value)
                elif col_name == 'FoodDB ID':
                    fooddb_ids.append(value)
                elif col_name == 'Chemspider ID':
                    chemspider_ids.append(value)
                elif col_name == 'KEGG Compound ID':
                    kegg_ids.append(value)
                elif col_name == 'BioCyc ID':
                    biocyc_ids.append(value)
                elif col_name == 'METLIN ID':
                    metlin_ids.append(value)
                elif col_name == 'PubChem Compound':
                    pubchem_ids.append(value)

    # Assemble in the original column order.
    df = pd.DataFrame()
    df['HMDB ID'] = hmdb_ids
    df['KEGG Compound ID'] = kegg_ids
    df['Secondary Accession Numbers'] = secondary_accessions
    df['Common Name'] = common_names
    df['Chemical Formula'] = chemical_formulas
    df['Average Molecular Weight'] = avg_mol_weights
    df['DrugBank ID'] = drugbank_ids
    df['FoodDB ID'] = fooddb_ids
    df['Chemspider ID'] = chemspider_ids
    df['BioCyc ID'] = biocyc_ids
    df['METLIN ID'] = metlin_ids
    df['PubChem Compound'] = pubchem_ids
    df['HMDB URL'] = metab_urls
    return df

def row_val_split(df, column, sep=',', keep=False):
    """Explode rows whose *column* holds a bracketed, sep-separated string.

    Rows like ``"[a, b]"`` are replaced by one row per inner value, with all
    other columns repeated unchanged. Rows with NaN in *column* are dropped.

    NOTE(review): in the original paste the docstring and body were dedented
    to module level, which is a SyntaxError; re-indented here, behaviour
    otherwise unchanged.

    :param df: input DataFrame.
    :param column: name of the column whose values are strings wrapped in one
        leading and one trailing character (e.g. list brackets), which are
        stripped before splitting.
    :param sep: separator between the values inside the brackets.
    :param keep: if True, also keep the original (un-split, bracket-stripped)
        value as an extra row whenever it splits into more than one value.
    :return: a new DataFrame (copy) with the exploded rows.
    """
    indexes = list()
    new_values = list()
    # NaN values cannot be split; drop those rows up front.
    df = df.dropna(subset=[column])
    for i, presplit in enumerate(df[column].astype(str)):
        # Strip the surrounding brackets, e.g. "[a,b]" -> "a,b".
        presplit = presplit[1:-1]
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(str(presplit))
        for value in values:
            indexes.append(i)
            new_values.append(value)
    # iloc is positional, matching the enumerate() indexes above.
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df

# Module-level placeholder so the name exists when the module is re-imported
# (e.g. by multiprocessing workers on platforms that spawn).
df = pd.DataFrame()


if __name__ == '__main__':
    # Two workers per core: the job is network-bound, so oversubscription helps.
    p = Pool(cpu_count() * 2)
    records = p.map(scrape_hmdb, plinks)  # one DataFrame per listing page
    df = pd.concat(records)
    # close() + join() lets workers shut down cleanly; the original called
    # terminate(), which kills them abruptly.
    p.close()
    p.join()

    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and 3 (the original used py2-only `print "..."` statements).
    print("Length of DF before splitting the rows: {}".format(len(df)))

    df = row_val_split(df, 'Secondary Accession Numbers')
    df.to_csv('HMDB_scrapping.csv')
    print("Length of DF after splitting the rows: {}".format(len(df)))
    t2 = datetime.now()
    print("Total Time Taken: {}".format(t2 - t1))

2 个答案:

答案 0 :(得分:1)

您需要使用$filter aggregation运算符,该运算符仅保留数组中匹配的元素并丢弃其他元素。

// Filter each sub's "files" down to entries with spec == "Unknown",
// then restore the original document shape.
db.collection.aggregate([
  {
    // One document per element of the "subs" array, so $filter can work on
    // each sub's own "files" array.
    $unwind: "$subs"
  },
  {
    $project: {
      "subs.name": "$subs.name",
      "subs.files": {
        // Keep only the file objects whose spec is exactly "Unknown".
        $filter: {
          input: "$subs.files",
          as: "file",
          cond: {
            $eq: [
              "$$file.spec",
              "Unknown"
            ]
          }
        }
      }
    }
  },
  {
    // Reassemble the unwound subs back into a single array per _id.
    $group: {
      _id: "$_id",
      subs: {
        $push: "$subs"
      }
    }
  }
])

以上将给出以下输出

[
  {
    "_id": ObjectId("5a934e000102030405000000"),
    "subs": [
      {
        "files": [
          {
            "name": ".....",
            "spec": "Unknown"
          },
          {
            "name": ".....",
            "spec": "Unknown"
          }
        ],
        "name": "subrepo1"
      },
      {
        "files": [
          {
            "name": "file2",
            "spec": "Unknown"
          }
        ],
        "name": "subrepo2"
      }
    ]
  }
]

您可以在 mongoplayground 上在线查看运行结果。

如果您只想得到由匹配文件组成的扁平数组,请在管道末尾追加$unwind和$replaceRoot阶段:

// Variant that returns the matching file objects themselves as a flat list.
db.collection.aggregate([
  {
    // One document per element of "subs".
    $unwind: "$subs"
  },
  {
    $project: {
      "subs.name": "$subs.name",
      "subs.files": {
        // Keep only the file objects whose spec is exactly "Unknown".
        $filter: {
          input: "$subs.files",
          as: "file",
          cond: {
            $eq: [
              "$$file.spec",
              "Unknown"
            ]
          }
        }
      }
    }
  },
  {
    // One document per surviving file object.
    $unwind: "$subs.files"
  },
  {
    // Promote each file object to be the whole output document.
    $replaceRoot: {
      newRoot: "$subs.files"
    }
  }
])

以上将给出以下输出

[
  {
    "name": ".....",
    "spec": "Unknown"
  },
  {
    "name": ".....",
    "spec": "Unknown"
  },
  {
    "name": "file2",
    "spec": "Unknown"
  }
]

答案 1 :(得分:0)

尝试这种方式:

// Alternative: fully unwind, then match. NOTE(review): each result document
// keeps the original _id and top-level shape, but "subs" / "subs.files" are
// single objects (one per matching file), not arrays — unlike answer 0 this
// does not regroup them.
db.col.aggregate([
    {
        $unwind: '$subs'
    },
    {
        $unwind: '$subs.files'
    },
    {
        // After both unwinds this path points at one file object, so the
        // match keeps exactly the files whose spec is "Unknown".
        $match: {
            'subs.files.spec': 'Unknown'
        }
    }
]);