The goal is to load a large number of "bulk" JSONs from S3. I found aiobotocore and was urged to try it, hoping for better efficiency while getting familiar with asyncio at the same time. I gave it a shot and it works, but my understanding of async programming is pretty basic, so I'm hoping for some improvements/comments. Maybe some kind soul out there can spot some obvious mistakes.
The problem is that boto3 only supports one HTTP request at a time. By using a ThreadPool I got significant improvements, but I'm hoping for a more efficient way.
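(For reference, the ThreadPool version I mean looks roughly like this; a minimal, untested sketch where the bucket, keys and worker count are just placeholders:)

from concurrent.futures import ThreadPoolExecutor

import boto3

s3 = boto3.client('s3')

def fetch(key):
    # each call blocks on a single HTTP request; threads let several run at once
    return s3.get_object(Bucket='some-bucket', Key=key)['Body'].read()

keys = ['some-prefix/a.json', 'some-prefix/b.json']  # placeholder keys
with ThreadPoolExecutor(max_workers=20) as pool:
    bodies = list(pool.map(fetch, keys))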
Here's the code.
Imports:
import os
import asyncio
import aiobotocore
from itertools import chain
import json
from json.decoder import WHITESPACE
I found this helper generator somewhere that parses the individual JSONs out of a string containing multiple JSONs:
def iterload(string_or_fp, cls=json.JSONDecoder, **kwargs):
    '''helper for parsing individual jsons from string of jsons (stolen from somewhere)'''
    string = str(string_or_fp)
    decoder = cls(**kwargs)
    idx = WHITESPACE.match(string, 0).end()
    while idx < len(string):
        obj, end = decoder.raw_decode(string, idx)
        yield obj
        idx = WHITESPACE.match(string, end).end()
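For example, it splits a string of two concatenated JSONs like this:

objs = list(iterload('{"a": 1} {"b": 2}'))
# objs == [{'a': 1}, {'b': 2}]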
This function gets the keys from an S3 bucket that match a given prefix:
# Async stuff starts here
async def get_keys(loop, bucket, prefix):
    '''Get keys in bucket based on prefix'''
    session = aiobotocore.get_session(loop=loop)
    async with session.create_client('s3', region_name='us-west-2',
                                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                     aws_access_key_id=AWS_ACCESS_KEY_ID) as client:
        keys = []
        # list s3 objects using paginator
        paginator = client.get_paginator('list_objects')
        async for result in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for c in result.get('Contents', []):
                keys.append(c['Key'])
        return keys
This function gets the content for a provided key and returns the flat list of decoded JSONs it contains:
async def get_object(loop, bucket, key):
    '''Get json content from s3 object'''
    session = aiobotocore.get_session(loop=loop)
    async with session.create_client('s3', region_name='us-west-2',
                                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                     aws_access_key_id=AWS_ACCESS_KEY_ID) as client:
        # get object from s3
        response = await client.get_object(Bucket=bucket, Key=key)
        async with response['Body'] as stream:
            content = await stream.read()
        return list(iterload(content.decode()))
This is the main function. It gathers the contents of all the keys it finds and flattens the list of lists:
async def go(loop, bucket, prefix):
    '''Returns list of dicts of object contents'''
    session = aiobotocore.get_session(loop=loop)
    async with session.create_client('s3', region_name='us-west-2',
                                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                     aws_access_key_id=AWS_ACCESS_KEY_ID) as client:
        keys = await get_keys(loop, bucket, prefix)
        contents = await asyncio.gather(*[get_object(loop, bucket, k) for k in keys])
        return list(chain.from_iterable(contents))
Finally, I run this, and the resulting list of dicts shows up nicely in result:
loop = asyncio.get_event_loop()
result = loop.run_until_complete(go(loop, 'some-bucket', 'some-prefix'))
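(As an aside, on Python 3.7+ the loop handling can be delegated to asyncio.run; a small, untested sketch wrapping the go above, where go_37 is just an illustrative name:)

async def go_37(bucket, prefix):
    # asyncio.run creates and closes the loop; fetch it for the old-style API
    return await go(asyncio.get_running_loop(), bucket, prefix)

result = asyncio.run(go_37('some-bucket', 'some-prefix'))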
One thing I think might be a bit odd is that I create a client in each async function. That could probably be lifted out, but I'm not sure how aiobotocore behaves with multiple clients.
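What I have in mind is roughly this: create the client once in go and hand it down, assuming get_keys and get_object are rewritten to take a client instead of a loop (an untested sketch):

async def go(loop, bucket, prefix):
    '''Single shared client for both listing and fetching'''
    session = aiobotocore.get_session(loop=loop)
    async with session.create_client('s3', region_name='us-west-2',
                                     aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                     aws_access_key_id=AWS_ACCESS_KEY_ID) as client:
        # get_keys and get_object would take the client as their first argument
        keys = await get_keys(client, bucket, prefix)
        contents = await asyncio.gather(*[get_object(client, bucket, k) for k in keys])
        return list(chain.from_iterable(contents))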
Also, I don't think you should have to wait until all keys are loaded before fetching the objects for those keys, which I believe is what happens in this implementation. I assume you could call get_object as soon as a key is found, so maybe this should be an async generator. But I'm not entirely clear on that.
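Something like this is what I imagine: an async generator yields keys as each page of results arrives, and every download is scheduled immediately (an untested sketch, again assuming a shared client):

async def iter_keys(client, bucket, prefix):
    # yield keys page by page instead of collecting them all first
    paginator = client.get_paginator('list_objects')
    async for result in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for c in result.get('Contents', []):
            yield c['Key']

async def go(client, bucket, prefix):
    # schedule each download as soon as its key is known
    tasks = [asyncio.ensure_future(get_object(client, bucket, key))
             async for key in iter_keys(client, bucket, prefix)]
    return list(chain.from_iterable(await asyncio.gather(*tasks)))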
Thanks in advance! I hope this helps someone in a similar situation.
Answer 0 (score: 1)
First, check out aioboto3.
Second, each client in aiobotocore is associated with an aiohttp session, and each session can have up to max_pool_connections connections. That's why the basic aiobotocore example does an async with on create_client: so the pool gets closed when you're done with the client.
With those pointers in mind, based on the above, here's how I would do it:
import asyncio
from itertools import chain
import json
from typing import List
from json.decoder import WHITESPACE
import logging
from functools import partial
# Third Party
import asyncpool
import aiobotocore.session
import aiobotocore.config
_NUM_WORKERS = 50
def iterload(string_or_fp, cls=json.JSONDecoder, **kwargs):
    # helper for parsing individual jsons from string of jsons (stolen from somewhere)
    string = str(string_or_fp)
    decoder = cls(**kwargs)
    idx = WHITESPACE.match(string, 0).end()
    while idx < len(string):
        obj, end = decoder.raw_decode(string, idx)
        yield obj
        idx = WHITESPACE.match(string, end).end()
async def get_object(s3_client, bucket: str, key: str):
    # Get json content from s3 object
    response = await s3_client.get_object(Bucket=bucket, Key=key)
    async with response['Body'] as stream:
        content = await stream.read()
    return list(iterload(content.decode()))
async def go(bucket: str, prefix: str) -> List[dict]:
    """
    Returns list of dicts of object contents

    :param bucket: s3 bucket
    :param prefix: s3 bucket prefix
    :return: list of dicts of object contents
    """
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()

    session = aiobotocore.session.AioSession()
    config = aiobotocore.config.AioConfig(max_pool_connections=_NUM_WORKERS)
    contents = []
    async with session.create_client('s3', config=config) as client:
        worker_co = partial(get_object, client, bucket)
        async with asyncpool.AsyncPool(None, _NUM_WORKERS, 's3_work_queue', logger, worker_co,
                                       return_futures=True, raise_on_join=True, log_every_n=10) as work_pool:
            # list s3 objects using paginator
            paginator = client.get_paginator('list_objects')
            async for result in paginator.paginate(Bucket=bucket, Prefix=prefix):
                for c in result.get('Contents', []):
                    contents.append(await work_pool.push(c['Key']))

    # retrieve results from futures
    contents = [c.result() for c in contents]
    return list(chain.from_iterable(contents))
_loop = asyncio.get_event_loop()
_result = _loop.run_until_complete(go('some-bucket', 'some-prefix'))
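If you'd rather not depend on asyncpool, roughly the same bounded concurrency can be had with an asyncio.Semaphore; a sketch under the same assumptions, where go_simple and get_object_limited are illustrative names:

async def get_object_limited(sem, s3_client, bucket, key):
    # the semaphore caps how many downloads are in flight at once
    async with sem:
        return await get_object(s3_client, bucket, key)

async def go_simple(bucket: str, prefix: str) -> List[dict]:
    session = aiobotocore.session.AioSession()
    config = aiobotocore.config.AioConfig(max_pool_connections=_NUM_WORKERS)
    async with session.create_client('s3', config=config) as client:
        sem = asyncio.Semaphore(_NUM_WORKERS)
        tasks = []
        paginator = client.get_paginator('list_objects')
        async for result in paginator.paginate(Bucket=bucket, Prefix=prefix):
            for c in result.get('Contents', []):
                tasks.append(asyncio.ensure_future(
                    get_object_limited(sem, client, bucket, c['Key'])))
        contents = await asyncio.gather(*tasks)
    return list(chain.from_iterable(contents))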