我正在将数据从 Mongo 迁移到 Postgres,但是遇到了性能瓶颈,需要解决。我对异步和协程还不熟悉,但根据我读到的资料,我相信它们能帮到我。我发现了支持 asyncio 的 MotorClient,于是写了下面这段代码进行测试。
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient
import pandas as pd
from bson.json_util import dumps
from json import loads
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import JSONB
# Shared Motor (async MongoDB) client and migration configuration.
# NOTE: <user>, <password>, <host>, <mongo_database>, <database> are
# placeholders to be replaced with real values — as written they are
# not valid Python.
CLIENT = AsyncIOMotorClient('mongodb://<user>:<password>@<host>:27017')
DB = CLIENT.<mongo_database>
# Mongo collections to migrate into Postgres.
COLLECTIONS = ['person', 'booking']
# Synchronous SQLAlchemy engine used by pandas.DataFrame.to_sql below.
ENG = create_engine('postgresql://<user>:<password>@<host>:5432/<database>')
async def process_data(collection):
    """Copy one Mongo collection into the Postgres schema ``mongo``.

    Fixes the reported ``TypeError: Object of type 'AsyncIOMotorCursor'
    is not JSON serializable``: a Motor cursor is lazy and must be
    consumed with ``await cursor.to_list()`` (or ``async for``) before
    its documents can be serialized. Also uses ``async def``/``await``
    instead of the deprecated ``@asyncio.coroutine`` decorator — the
    original body never awaited anything, so it ran fully synchronously.
    """
    cursor = DB[collection].find()
    # Materialize the async cursor into a list of documents.
    # length=None fetches everything; pass a number to cap memory use.
    documents = await cursor.to_list(length=None)
    # bson.json_util.dumps understands BSON types (ObjectId, dates, ...);
    # the json round-trip yields plain dicts/lists for pandas.
    data = loads(dumps(documents))
    df = pd.DataFrame(data)
    # object-dtype columns hold nested documents or strings: store as JSONB.
    obj_cols = df.select_dtypes(include=[object]).columns.values.tolist()
    # NOTE(review): to_sql is synchronous and will block the event loop
    # while it writes — this remains a bottleneck for true concurrency.
    df.to_sql(schema='mongo', name=collection, con=ENG, if_exists='replace',
              index=False, dtype={c: JSONB for c in obj_cols})
# Drive the migration: one coroutine per collection, run concurrently
# on a freshly created event loop, then shut the loop down.
event_loop = asyncio.new_event_loop()
asyncio.set_event_loop(event_loop)
pending = [process_data(name) for name in COLLECTIONS]
event_loop.run_until_complete(asyncio.gather(*pending))
event_loop.close()
当前,在 data = loads(dumps(cursor)) 这一行代码上,我收到了这个错误:TypeError: Object of type 'AsyncIOMotorCursor' is not JSON serializable。我相信这是因为 MotorClient 的游标对象不可直接迭代。
我的方向是对的吗?还是说只有当 SQLAlchemy 和 Pandas 也支持 asyncio 时,这种做法才真正有效?
我不希望使用Pandas作为接口,并且还遇到了以下异步postgres库,因此也许可以将其与MotorClient结合使用以实现所需的功能?
经过更多尝试并升级到 Python 3.7 之后,我想我已经得到了一个可行的解决方案。不过我不确定其中的 JSON 函数是否是同步运行的,这是否可能造成瓶颈?
import json
from bson.json_util import dumps
from motor.motor_asyncio import AsyncIOMotorClient
import asyncio
import asyncpg
from datetime import datetime
# Shared Motor (async MongoDB) client and migration configuration.
# NOTE: <user>, <password>, <host>, <database> are placeholders to be
# replaced with real values — as written they are not valid Python.
CLIENT = AsyncIOMotorClient('mongodb://<user>:<password>@<host>:27017')
DB = CLIENT.<database>
# Mongo collections to migrate into Postgres.
COLLECTIONS = ['person', 'booking']
async def insert_to_postgres(rows, collection):
con = await asyncpg.connect(user=<user>, password=<password>, database=<database>, host=<host>)
result = await con.copy_records_to_table(table_name=collection, schema_name='mongo', records = rows)
await con.close()
async def get_mongo_data(collection):
    """Stream documents from a Mongo collection into Postgres in batches.

    Two fixes over the original:
    - ``json.dumps(json.loads(dumps(document)))`` was a redundant
      round-trip: ``bson.json_util.dumps`` already returns a JSON string
      (it uses ``json.dumps`` internally), so ``dumps(document)`` alone
      produces the same text.
    - The original accumulated the ENTIRE collection in memory before a
      single insert; we now flush every 1000 documents (matching the
      cursor's ``batch_size``), bounding memory and letting inserts
      overlap with fetching.

    The remaining JSON serialization is synchronous, but it is cheap
    per-document and interleaved with the awaited cursor reads.
    """
    cursor = DB[collection].find(batch_size=1000)
    batch = []
    async for document in cursor:
        batch.append((str(document.get('_id')), dumps(document)))
        if len(batch) >= 1000:
            await insert_to_postgres(batch, collection)
            batch = []
    # Flush the final partial batch, if any.
    if batch:
        await insert_to_postgres(batch, collection)
    print(datetime.now(), collection)
# Drive the migration: one task per collection, run concurrently,
# with simple wall-clock timing around the whole run.
start = datetime.now()
print(start, 'Starting')
tasks = [get_mongo_data(x) for x in COLLECTIONS]
loop = asyncio.get_event_loop()
# Fix: asyncio.wait() does NOT raise task exceptions — failures sat
# silently in the returned "done" set. gather() propagates the first
# exception so a failed migration is visible.
loop.run_until_complete(asyncio.gather(*tasks))
stop = datetime.now()
print(datetime.now(), 'Stopping')
print((stop - start).seconds, 'elapsed seconds')