我试图收集字段unknown
等于{
"_id" :"5b1e73786f11e421956023c3",
"subs" : [
{
"name" : "subrepo1",
"files" : [
{
"name" : ".....",
"spec" : "Unknown"
},
{
"name" : ".....",
"spec" : "Unknown"
}
]
},
{
"name" : "subrepo2",
"files" : [
{
"name" : "file2",
"spec" : "Unknown"
},
{
"name" : ".....",
"spec" : "1234"
}
]
}
]
}
的嵌套数组中的所有对象。每个文档的结构都与上面的示例类似。下面是我尝试的聚合查询:
// Question's attempt, kept verbatim — it does NOT work as written:
db.col.aggregate([
  {$match: {'subs.files.spec': 'Unknown'}},
  {$project: {
    'subs.files': {$filter: {
      // NOTE(review): 'subs' is itself an array, so '$subs.files' here is an
      // array of arrays — $filter then iterates sub-arrays, not file objects.
      input: '$subs.files',
      //as: 'subs.files',
      // BUG: 'this.spec' is a plain string literal; referencing the $filter
      // variable requires '$$this.spec'. Also the compared value is
      // 'FunSuite' while the question wants 'Unknown'.
      cond: {$eq: ['this.spec', 'FunSuite']}
    }},
    //_id: 0
  }}
])
我尝试了上面的聚合查询,但是它不起作用。我认为这是正确的方向,但我可能遗漏了一些重要的东西。
{
"_id" : "5b1e73786f11e421956023c3",
"subs" : [
{
"name" : "subrepo1",
"files" : [
{
"name" : ".....",
"spec" : "Unknown"
},
{
"name" : ".....",
"spec" : "Unknown"
}
]
},
{
"name" : "subrepo2",
"files" : [
{
"name" : "file2",
"spec" : "Unknown"
}
]
}
]
}
预期的输出如上(即只保留那些 spec 等于 Unknown 的文件,不包含其他文件)。
#import the library used to query a website
import urllib2
from bs4 import BeautifulSoup
import pandas as pd
from multiprocessing import Pool, cpu_count
from datetime import datetime
# Record the start time so total runtime can be reported at the end.
t1 = datetime.now()
hmdb_link = "http://www.hmdb.ca/metabolites"
page_link = 'http://www.hmdb.ca/metabolites?c=hmdb_id&d=up&page=1'
# One listing-page URL per page number (pages 1..4561, hard-coded page count).
plinks = [
    'http://www.hmdb.ca/metabolites?c=hmdb_id&d=up&page=%d' % page_no
    for page_no in range(1, 4562)
]
def scrape_hmdb(link):
    """Scrape one HMDB listing page and every metabolite it links to.

    For each metabolite card on the listing page, fetches the metabolite's
    detail page and collects selected table fields into parallel lists,
    which are assembled into a pandas DataFrame (one row per metabolite —
    assuming every detail page supplies every field exactly once; TODO
    confirm, otherwise the column lists drift out of sync).

    link -- URL of one HMDB listing page (an element of plinks).
    Returns a pd.DataFrame with one column per collected field.
    """
    # Parallel accumulator lists, one per output column.
    a = []  # HMDB ID
    b = []  # Secondary Accession Numbers (list of strings per metabolite)
    c = []  # Common Name
    d = []  # Chemical Formula
    e = []  # Average Molecular Weight
    f = []  # DrugBank ID
    g = []  # FoodDB ID
    h = []  # Chemspider ID
    i = []  # KEGG Compound ID
    j = []  # BioCyc ID
    k = []  # METLIN ID
    l = []  # PubChem Compound
    m = []  # URL of each metabolite detail page visited
    # for link in plinks:
    # print links
    page = urllib2.urlopen(link).read()
    soup = BeautifulSoup(page)
    # The metabolite listing table on the index page.
    table = soup.find('table', {'class': "table table-striped table-condensed table-hover metabolites"})
    for row in table.findAll("tr"):
        cells = row.findAll('td')
        for cell in cells:
            # print cell
            # Only cells carrying a 'btn-card' anchor point at a metabolite.
            links = cell.find('a', {'class': 'btn-card'})
            if links is not None:
                # cell.a.string is the HMDB accession; build the detail URL.
                metab_link = hmdb_link + '/' + cell.a.string
                print link, metab_link
                m.append(metab_link)
                metab_page = urllib2.urlopen(metab_link).read()
                metab_soup = BeautifulSoup(metab_page)
                # Key/value table on the metabolite detail page:
                # <th> holds the field name, <td> holds the value.
                hmdb_table = metab_soup.find('table', {'class': 'content-table table table-condensed table-bordered'})
                # NOTE(review): this inner loop rebinds `row` and `cells`
                # from the outer loop; works only because the outer loop
                # doesn't use them again afterwards.
                for row in hmdb_table.findAll("tr"):
                    cells = row.findAll('td')
                    states = row.findAll('th')  # To store second column data
                    if not (cells == [] or states == []):
                        # print states
                        col_name = str(states[0].find(text=True))
                        value = cells[0].find(text=True)
                        if col_name == 'HMDB ID':
                            a.append(value)
                            # print a
                        elif col_name == 'Secondary Accession Numbers':
                            # Keep each accession as a separate list element.
                            san = [cell.text for cell in cells]
                            y = san[0].split('\n')
                            # text = san.replace('\n', ', ')
                            # ", ".join(san.splitlines())
                            b.append(y)
                        elif col_name == 'Common Name':
                            c.append(value)
                        elif col_name == 'Chemical Formula':
                            chem_form = [cell.text for cell in cells]
                            # NOTE(review): the genexp variable `e` does not
                            # leak out of a generator expression, so the
                            # outer list `e` is unaffected here.
                            d.append(''.join(str(e) for e in chem_form))
                        elif col_name == 'Average Molecular Weight':
                            e.append(value)
                        elif col_name == 'DrugBank ID':
                            f.append(value)
                        elif col_name == 'FoodDB ID':
                            g.append(value)
                        elif col_name == 'Chemspider ID':
                            h.append(value)
                        elif col_name == 'KEGG Compound ID':
                            i.append(value)
                        elif col_name == 'BioCyc ID':
                            j.append(value)
                        elif col_name == 'METLIN ID':
                            k.append(value)
                        elif col_name == 'PubChem Compound':
                            l.append(value)
    # Assemble the per-field lists into one DataFrame; all lists must be
    # the same length for this to succeed.
    df = pd.DataFrame()
    df['HMDB ID'] = a
    df['KEGG Compound ID'] = i
    df['Secondary Accession Numbers'] = b
    df['Common Name'] = c
    df['Chemical Formula'] = d
    df['Average Molecular Weight'] = e
    df['DrugBank ID'] = f
    df['FoodDB ID'] = g
    df['Chemspider ID'] = h
    df['BioCyc ID'] = j
    df['METLIN ID'] = k
    df['PubChem Compound'] = l
    df['HMDB URL'] = m
    return df
def row_val_split(df, column, sep=',', keep=False):
    """Explode comma-separated values in *column* into one row apiece.

    Each cell of *column* is treated as a bracketed, separator-joined string
    (e.g. the str() of a list): its first and last characters are stripped,
    the remainder is split on *sep*, and one output row is produced per
    part, duplicating all other columns. Rows with NaN in *column* are
    dropped. With keep=True, a multi-valued cell additionally keeps one row
    holding the original (unsplit) inner string.
    """
    row_positions = []
    expanded = []
    df = df.dropna(subset=[column])
    for pos, raw in enumerate(df[column].astype(str)):
        inner = raw[1:-1]          # drop the surrounding bracket characters
        parts = inner.split(sep)
        if keep and len(parts) > 1:
            # Retain the unsplit value as its own row, ahead of the parts.
            row_positions.append(pos)
            expanded.append(str(inner))
        for part in parts:
            row_positions.append(pos)
            expanded.append(part)
    # iloc with repeated positions duplicates the source rows as needed.
    out = df.iloc[row_positions, :].copy()
    out[column] = expanded
    return out
df = pd.DataFrame()
if __name__ == '__main__':
p = Pool(cpu_count()*2) # Pool tells how many at a time
records = p.map(scrape_hmdb, plinks)
df = pd.concat(records)
p.terminate()
p.join()
print "Length of DF before spitting the rows: {}".format(len(df))
df = row_val_split(df, 'Secondary Accession Numbers')
df.to_csv('HMDB_scrapping.csv')
print "length of DF after splitting the rows: {}".format(len(df))
t2 = datetime.now()
t = t2-t1
print "Total Time Taken: {}".format(t)
答案 0(得分:1):
您需要使用$filter aggregation运算符,该运算符仅提供数组中匹配的元素并转义其他元素
db.collection.aggregate([
  {
    // One document per sub, so $filter can run on each sub's files array.
    $unwind: "$subs"
  },
  {
    $project: {
      "subs.name": "$subs.name",
      // Keep only the file entries whose spec equals "Unknown".
      "subs.files": {
        $filter: {
          input: "$subs.files",
          as: "file",
          cond: {
            $eq: [
              // '$$file' references the $filter variable declared in 'as'.
              "$$file.spec",
              "Unknown"
            ]
          }
        }
      }
    }
  },
  {
    // Re-assemble the filtered subs back into one array per original _id.
    $group: {
      _id: "$_id",
      subs: {
        $push: "$subs"
      }
    }
  }
])
以上将给出以下输出
[
{
"_id": ObjectId("5a934e000102030405000000"),
"subs": [
{
"files": [
{
"name": ".....",
"spec": "Unknown"
},
{
"name": ".....",
"spec": "Unknown"
}
],
"name": "subrepo1"
},
{
"files": [
{
"name": "file2",
"spec": "Unknown"
}
],
"name": "subrepo2"
}
]
}
]
您可以查看结果here
如果您想把每个匹配的文件作为单独的对象获取,可以在管道末尾加上 $unwind 和 $replaceRoot 阶段(如下所示);反之,若想以数组形式获取字段,删除这两个阶段即可。
db.collection.aggregate([
  {
    // One document per sub.
    $unwind: "$subs"
  },
  {
    $project: {
      "subs.name": "$subs.name",
      // Keep only files whose spec equals "Unknown".
      "subs.files": {
        $filter: {
          input: "$subs.files",
          as: "file",
          cond: {
            $eq: [
              "$$file.spec",
              "Unknown"
            ]
          }
        }
      }
    }
  },
  {
    // One document per matching file.
    $unwind: "$subs.files"
  },
  {
    // Promote each file object to the document root, discarding _id and
    // sub name — yields a flat list of {name, spec} objects.
    $replaceRoot: {
      newRoot: "$subs.files"
    }
  }
])
以上将给出以下输出
[
{
"name": ".....",
"spec": "Unknown"
},
{
"name": ".....",
"spec": "Unknown"
},
{
"name": "file2",
"spec": "Unknown"
}
]
答案 1 :(得分:0)
尝试这种方式:
// NOTE(review): this double-$unwind approach emits one document per
// matching file (with subs/files flattened), NOT the nested array shape
// shown in the question's expected output — it never regroups.
db.col.aggregate([
  {
    $unwind: '$subs'
  },
  {
    $unwind: '$subs.files'
  },
  {
    // After both unwinds each doc holds a single file, so a plain $match
    // on its spec keeps exactly the 'Unknown' files.
    $match: {
      'subs.files.spec': 'Unknown'
    }
  }
]);