I am querying MongoDB documents with PyMongo and trying to write them to S3. I want to save the file to S3 as a JSON array, so that I can easily restore it back into Mongo later. The MongoDB query is built dynamically, and saving the documents to a temporary location first and then uploading is not an option for me.
The snippet below works, but it consumes a lot of memory: copying 1.5 GB of data takes ~12 GB of physical memory.
mongo_query = {"$and": [{"ABC": school_year_id.upper()}, {"XYZ": clientid}]}

plans = mongo_conn[self.database][self.collection]
plans_archivable_docs = plans.find(mongo_query)

s3_key = school_year + '/' + clientid + '/' + self.database + self.collection + '.json'
s3_client = mngarchs3.get_s3_client()
response = s3_client.put_object(ACL='private',
                                Bucket='xxxx-mongo-archives',
                                Key=s3_key,
                                Body=dumps(plans_archivable_docs)
                                )
print(response)
Is there a more memory-efficient option?
Answer 0: (score: 0)
The strategy to apply for large files in your scenario is to upload them to the Amazon S3 bucket in chunks. You can use the MultipartUpload API available in the boto3 library. I would recommend that route if you want the chunks concatenated into a single file after the upload.
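For context, a minimal sketch of the multipart route could look like the following. This is only an illustration, not a drop-in implementation: the part size is an arbitrary choice above S3's 5 MB minimum for non-final parts, it writes one document per line rather than a JSON array, it omits retries, and it assumes the dumps in your snippet is bson.json_util.dumps.
import boto3
from bson.json_util import dumps  # handles ObjectId, dates, etc.

def multipart_upload(cursor, bucket, key, part_size=8 * 1024 * 1024):
    """Stream a PyMongo cursor to a single S3 object via multipart upload."""
    s3 = boto3.client('s3')
    upload = s3.create_multipart_upload(Bucket=bucket, Key=key)
    parts, buf, part_number = [], bytearray(), 1
    try:
        for doc in cursor:
            buf.extend(dumps(doc).encode('utf-8') + b'\n')
            if len(buf) >= part_size:  # every part except the last must be >= 5 MB
                resp = s3.upload_part(Bucket=bucket, Key=key, PartNumber=part_number,
                                      UploadId=upload['UploadId'], Body=bytes(buf))
                parts.append({'ETag': resp['ETag'], 'PartNumber': part_number})
                part_number += 1
                buf.clear()
        if buf:  # flush the final (possibly smaller) part
            resp = s3.upload_part(Bucket=bucket, Key=key, PartNumber=part_number,
                                  UploadId=upload['UploadId'], Body=bytes(buf))
            parts.append({'ETag': resp['ETag'], 'PartNumber': part_number})
        s3.complete_multipart_upload(Bucket=bucket, Key=key,
                                     UploadId=upload['UploadId'],
                                     MultipartUpload={'Parts': parts})
    except Exception:
        # abandon the upload so incomplete parts are not billed
        s3.abort_multipart_upload(Bucket=bucket, Key=key, UploadId=upload['UploadId'])
        raise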
That said, if concatenating the chunks into a single object isn't important here, you can implement a naive writer that uploads the documents of the collection chunk by chunk. It could be written as follows.
Note that you could improve on this to handle errors and retries. You could also run the uploads in parallel (see the sketch after the usage example below).
import json
import io

import boto3
from pymongo import MongoClient


class ObjectChunkIO:
    def __init__(self, bucket, prefix):
        self.bucket = bucket
        self.prefix = prefix
        self.iterator = self.chunk_iterator()

    def next_chunk(self):
        return next(self.iterator)

    def chunk_iterator(self):
        raise NotImplementedError


class ObjectChunkWriter(ObjectChunkIO):
    def __init__(self, bucket, prefix, chunk_size=5e7):
        super().__init__(bucket, prefix)
        self.chunk_num = 0
        # the object size in bytes. defaults to 50MB
        self.chunk_size = chunk_size

    def _upload_buf(self, object_data):
        self.bucket.put_object(Body=object_data, Key=self.path())

    def chunk_iterator(self):
        # yields a fresh in-memory buffer per chunk; advancing the iterator
        # closes the previous buffer and bumps the chunk number
        while True:
            buffer = io.BytesIO()
            yield buffer
            buffer.close()
            self.chunk_num += 1

    def path(self):
        return '{prefix}.{0:03d}.json'.format(
            self.chunk_num, prefix=self.prefix)

    def write(self, cur):
        # buffers documents as newline-delimited JSON and flushes a chunk
        # to S3 whenever adding the next document would exceed chunk_size
        with self.next_chunk() as buf:
            for doc in cur:
                b_doc = json.dumps(doc).encode('utf-8')
                if buf.tell() + len(b_doc) > self.chunk_size:
                    self._upload_buf(buf.getvalue())
                    buf = self.next_chunk()
                buf.writelines([b_doc, b'\n'])
            self._upload_buf(buf.getvalue())
This can be used as follows:
def get_collection():
    db_client = MongoClient()
    db = db_client.<database>
    return db.<collection>


def get_bucket():
    session = boto3.Session(profile_name='<profile>')
    s3_resource = session.resource('s3')
    return s3_resource.Bucket('<bucket>')


def upload_in_chunks(cur, bucket):
    ''' Uploads documents returned by a MongoDB cursor to an S3 bucket in chunks
    Args:
        - cur: pymongo.cursor.Cursor
        - bucket: s3.Bucket
    '''
    object_chunk_writer = ObjectChunkWriter(bucket,
        prefix='streaming/sample-output/chunk')
    object_chunk_writer.write(cur)


collection = get_collection()
cur = collection.find()
bucket = get_bucket()
upload_in_chunks(cur, bucket)
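As noted earlier, the chunk uploads can also run in parallel. A minimal sketch under a few assumptions: it reuses the chunking logic above, uses a shared low-level client on the default session (boto3 documents clients as generally thread-safe, unlike resources, and this ignores the profile used in get_bucket), and max_workers is an arbitrary choice.
from concurrent.futures import ThreadPoolExecutor

import boto3


class ParallelObjectChunkWriter(ObjectChunkWriter):
    ''' Same chunking logic, but each chunk is uploaded from a worker thread. '''
    def __init__(self, bucket, prefix, chunk_size=5e7, max_workers=4):
        super().__init__(bucket, prefix, chunk_size=chunk_size)
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        # low-level clients are documented as generally thread-safe
        self.s3_client = boto3.client('s3')
        self.futures = []

    def _upload_buf(self, object_data):
        # path() is evaluated here, before next_chunk() bumps chunk_num
        self.futures.append(
            self.executor.submit(self.s3_client.put_object,
                                 Bucket=self.bucket.name,
                                 Key=self.path(),
                                 Body=object_data))

    def wait(self):
        # surface any exception raised inside the worker threads
        for future in self.futures:
            future.result()
        self.executor.shutdown()
Usage is the same as above, except that after object_chunk_writer.write(cur) you call object_chunk_writer.wait() to block until every chunk has been uploaded.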
Reading the uploaded chunks back is the fairly easy part.
class ObjectChunkReader(ObjectChunkIO):
    def chunk_iterator(self):
        # MaxKeys caps each list request, not the total; the collection
        # keeps paginating through every object under the prefix
        object_chunks = self.bucket.objects.filter(Prefix=self.prefix, MaxKeys=3)
        for object_summary in object_chunks:
            response = object_summary.get()
            body = response['Body']
            chunk_object = [json.loads(line) for line in body.iter_lines()]
            yield chunk_object

    def __iter__(self):
        for object_chunk in self.iterator:
            yield object_chunk


def write_objects_to_collection(bucket, collection):
    ''' Writes S3 objects to a MongoDB collection
    Args:
        - bucket: s3.Bucket
        - collection: pymongo.collection.Collection
    '''
    object_chunk_reader = ObjectChunkReader(bucket,
        prefix='streaming/sample-output/chunk')
    for object_chunk in object_chunk_reader:
        collection.insert_many(object_chunk)


collection = get_collection()
bucket = get_bucket()
write_objects_to_collection(bucket, collection)