我想把自己的查询对象传给下面这个基于 pymongo 的读取函数:
import pandas as pd
from pymongo import MongoClient
def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to database ``db``.

    When both ``username`` and ``password`` are truthy, a ``mongodb://`` URI
    carrying the credentials is built; otherwise the client connects to
    ``host``/``port`` without credentials.

    Args:
        host: Server hostname or address.
        port: Server port.
        username: User name, or a falsy value to connect without credentials.
        password: Password, or a falsy value to connect without credentials.
        db: Name of the database to select (also used as the URI path).

    Returns:
        ``conn[db]``, the database object obtained from the ``MongoClient``.
    """
    if username and password:
        # Function-scope import so this block does not depend on a new
        # top-of-file import.
        from urllib.parse import quote_plus

        # Credentials must be percent-encoded; characters such as '@', ':'
        # or '/' would otherwise corrupt the URI.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = MongoClient(mongo_uri)
    else:
        conn = MongoClient(host, port)
    return conn[db]
def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Read documents from a MongoDB collection into a pandas DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside that database.
        query: Filter document passed to ``find``. It must be a mapping
            (for example a ``dict``), not a string. ``None`` means "match
            everything".
        host: Server hostname or address.
        port: Server port.
        username: Optional user name for authentication.
        password: Optional password for authentication.
        no_id: Accepted for backward compatibility. It currently has no
            effect: the ``_id`` column is kept in the result.

    Returns:
        A ``pandas.DataFrame`` built from the list of matching documents.
    """
    # Avoid a shared mutable default argument.
    if query is None:
        query = {}

    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    try:
        # Make a query to the specific DB and Collection
        cursor = database[collection].find(query)
        # Expand the cursor and construct the DataFrame
        return pd.DataFrame(list(cursor))
    finally:
        # The client is created for this call only; release its sockets.
        database.client.close()
我的查询定义为:
# NOTE(review): this is the asker's original input, kept verbatim. It is one
# quoted string (not a dict) holding two mongo-shell documents separated by a
# comma: a filter and a projection. `$gte : new ISODate(...)` is shell syntax,
# and a double-quoted string cannot span lines, so this is not valid Python.
query_1 = "{
"status" : {"$ne" : "deprecated"},
"geoLocationData.date" : { $gte : new ISODate("2016-08-03") }
},
{ "geoLocationData.date": 1,
"geoLocationData.iso": 1,
"httpRequestData.ipAddress": 1,
"geoLocationData.city": 1,
"geoLocationData.latitude": 1,
"geoLocationData.longitude": 1 }"
把它传入函数,以获取 pandas DataFrame:
# Run the query and load the matching documents into a DataFrame.
df = read_mongo(
    db,
    collection,
    query=query_1,
    host=host,
    port=port,
    username=username,
    password=password,
)
我收到错误:
TypeError: filter must be an instance of dict, bson.son.SON, or other type that inherits from collections.Mapping
如果去掉子文档部分,查询就能正常执行,结果也可以转换为 DataFrame。
我猜问题出在需要把查询(包含子文档)转换为字典上。该怎么做?
答案 0 :(得分:0)
您的 `query_1` 变量是一个字符串,其中似乎包含两个字典。`find` 方法的第一个参数是作为过滤条件的字典,第二个参数是投影;而您只传递了一个参数。
这应该有效:
def read_mongo(db, collection, filter=None, projection=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Read documents from a MongoDB collection into a pandas DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside that database.
        filter: Filter document (a mapping) passed as the first argument to
            ``find``. ``None`` means "match everything". The name shadows
            the ``filter`` builtin but is kept for caller compatibility.
        projection: Projection passed as the second argument to ``find``.
            ``None`` returns all fields.
        host: Server hostname or address.
        port: Server port.
        username: Optional user name for authentication.
        password: Optional password for authentication.
        no_id: Accepted for backward compatibility. It currently has no
            effect: the ``_id`` column is kept in the result.

    Returns:
        A ``pandas.DataFrame`` built from the list of matching documents.
    """
    # Avoid a shared mutable default argument.
    if filter is None:
        filter = {}
    # ``projection`` defaults to None on purpose: PyMongo treats an empty
    # projection as "return only _id", which the old ``{}`` default triggered.

    # Connect to MongoDB
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    try:
        # Make a query to the specific DB and Collection
        cursor = database[collection].find(filter, projection)
        # Expand the cursor and construct the DataFrame
        return pd.DataFrame(list(cursor))
    finally:
        # The client is created for this call only; release its sockets.
        database.client.close()
from datetime import datetime

# Filter: non-deprecated documents dated on or after 2016-08-03.
query_filter = {
    "status": {"$ne": "deprecated"},
    # The shell's `new ISODate(...)` is not Python. PyMongo expects a
    # datetime here; naive datetimes are interpreted as UTC.
    "geoLocationData.date": {"$gte": datetime(2016, 8, 3)},
}

# Projection: the fields to include in each returned document.
query_project = {
    "geoLocationData.date": 1,
    "geoLocationData.iso": 1,
    "httpRequestData.ipAddress": 1,
    "geoLocationData.city": 1,
    "geoLocationData.latitude": 1,
    "geoLocationData.longitude": 1,
}
# Query with a separate filter and projection, then load into a DataFrame.
df = read_mongo(
    db,
    collection,
    filter=query_filter,
    projection=query_project,
    host=host,
    port=port,
    username=username,
    password=password,
)
不过,我认为这种方法并不能完全满足您的需求。这是因为 `find` 操作中的投影只能包含或排除字段,而不能按您希望的方式对字段进行重新映射。您可以先遍历游标、对数据进行处理,再传给 `DataFrame` 构造函数(注意不要创建过大的 Python 列表)。更好的做法是不使用 `find`,而是使用 `aggregate`:
# Run the two stages as an aggregation pipeline instead of a find().
pipeline = [filter, projection]
cursor = db[collection].aggregate(pipeline)
现在,您可以把过滤条件作为 `$match` 管道阶段,把投影作为 `$project` 阶段。
# Body of the $project stage: each output field name maps to the
# "$"-prefixed path of the source field it is taken from.
query_project = {
    "geoDate": "$geoLocationData.date",
    "geoLoc": "$geoLocationData.iso",
    "ipAddress": "$httpRequestData.ipAddress",
    "city": "$geoLocationData.city",
    "lat": "$geoLocationData.latitude",
    "long": "$geoLocationData.longitude",
}
# Wrap the filter and the field mapping as $match / $project pipeline stages.
df = read_mongo(
    db,
    collection,
    filter={"$match": query_filter},
    projection={"$project": query_project},
    host=host,
    port=port,
    username=username,
    password=password,
)