I have a project I'm working on where the first step is to pull the data stored in a bunch of archives in an AWS S3 bucket out into a separate bucket. I want this step to run as quickly as possible, so I'm running the extraction in parallel with a ThreadPoolExecutor. However, some of the archives are very large (e.g. 95 GB compressed), and I seem to be running into memory problems while processing them (some rough worst-case numbers are sketched right after the step list below).
What I'm currently doing is:
1: Collect the compressed archive files from the source S3 bucket
2: Read each archive file
3: Check whether that archive has already been extracted; if not, continue with steps 4 and 5
4: Extract the archive into a dictionary, where the key is the archive's name and the value is a list of all the files stored in the archive (I think this is where the problem is)
5: Upload the extracted files to the destination S3 bucket
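To put rough numbers on the memory concern (back-of-envelope arithmetic only, under the assumption that all eight workers happen to be processing one of the largest archives at the same time):

# Back-of-envelope worst case, not a measurement: read_archive() below buffers the
# whole compressed object in memory (io.BytesIO(obj.get()['Body'].read())), so each
# worker holds at least the full compressed size of whatever it is working on.
workers = 8                  # default worker count in the ThreadPool class below
largest_archive_gb = 95      # biggest compressed archive seen so far

worst_case_gb = workers * largest_archive_gb   # assumes every worker hits a huge archive at once
print(f'Compressed buffers alone, worst case: ~{worst_case_gb} GB')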
Here is my main.py:
import utils
from threadpool import ThreadPool

if __name__ == '__main__':
    args = utils.collect_args()
    staging_bucket = utils.create_staging_bucket()

    print('Collecting archives...')
    archives = utils.collect_archives(args.prev_output)

    print('\nReading, extracting, and uploading archives...\n')
    pool = ThreadPool(archives['Archives'], utils.read_archive)
    pool.create_pool()
    pool.start_pool(src_bucket = archives['Bucket'], dest_bucket = staging_bucket)
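(collect_args and create_staging_bucket also live in utils.py but aren't shown, since their details don't matter much here. Roughly, they amount to something like the sketch below; the argument and bucket names in it are just placeholders, not the real values.)

import argparse

import boto3


def collect_args():
    # Simplified sketch; details omitted.
    parser = argparse.ArgumentParser()
    parser.add_argument('--prev-output', dest = 'prev_output',
                        help = 'JSON output of the previous step (contains input_bucket)')
    return parser.parse_args()


def create_staging_bucket():
    # Simplified sketch: creates the staging bucket and returns its name.
    s3 = boto3.client('s3')
    bucket = 'my-staging-bucket'  # placeholder name
    s3.create_bucket(Bucket = bucket)
    return bucket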
Here is my ThreadPool class:
import concurrent.futures
import os
import time
from functools import partial


class ThreadPool():
    def __init__(self, items, func):
        self.items = items
        self.func = func
        self.workers = 8
        self.pool = None

    def create_pool(self):
        pool = concurrent.futures.ThreadPoolExecutor(max_workers = self.workers)
        self.pool = pool

    def start_pool(self, **kwargs):
        assert self.pool is not None, 'No pool has been created, please run create_pool first.'

        threads = []
        for item in self.items:
            thread = self.pool.submit(partial(self.func, **kwargs), item)
            threads.append(thread)

        pending_jobs = len(self.pool._work_queue.queue)
        print(f'Pending Jobs: {pending_jobs}')
        print(f'Threads: {len(self.pool._threads)}\n')
        if pending_jobs > 0:
            print('Estimated Pending Work Queue:\n')
            for num, item in enumerate(self.pool._work_queue.queue):
                print('{}\t{}\t{}\t{}'.format(num + 1, item.fn, item.args, item.kwargs))

        for thread in concurrent.futures.as_completed(threads):
            time.sleep(30)
            os.system('cls')
            pending_jobs = len(self.pool._work_queue.queue)
            print(f'\nPending Jobs: {pending_jobs}')
            if pending_jobs > 0:
                print('Estimated Pending Work Queue:')
                for num, item in enumerate(self.pool._work_queue.queue):
                    print('{}\t{}\t{}\t{}'.format(num + 1, item.fn, item.args, item.kwargs))

        _, _ = concurrent.futures.wait(threads)
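(For the progress output, the pending count could also be derived from the Future objects themselves instead of the executor's private _work_queue and _threads attributes; a minimal sketch, reusing the same threads list of futures built in start_pool:)

def pending_count(futures):
    # A future that is neither running nor done is still waiting in the executor's queue.
    return sum(1 for f in futures if not f.running() and not f.done())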
And here are the functions that get called during this process:
import io
import json
import tarfile
import zipfile

import botocore.exceptions

# create_s3_session() isn't shown here; it returns a boto3 S3 service resource.


def collect_archives(prev_output):
    s3 = create_s3_session()
    with open(prev_output, 'rb') as input:
        data = input.read().decode('utf-8')
        data = json.loads(data)

    archives_dict = {'Bucket': data['input_bucket'], 'Archives': None}
    archives = []
    for obj in s3.Bucket(data['input_bucket']).objects.all():
        archives.append(obj.key)
    archives_dict['Archives'] = archives
    return archives_dict


def read_archive(archive, **kwargs):
    s3 = create_s3_session()
    obj = s3.Object(kwargs['src_bucket'], archive)
    # Buffer the entire compressed object in memory before opening it as an archive.
    buffer = io.BytesIO(obj.get()['Body'].read())

    if obj.content_type == 'application/zip':
        zip = zipfile.ZipFile(buffer)
        size = sum([zinfo.file_size for zinfo in zip.filelist])
        extract_archive({'Archive':zip, 'Size':size, 'Src':obj.key}, **kwargs)
        #results.append({'Archive':zip, 'Size':size, 'Src': obj.key})
        #results.append({'Archive':zip, 'Src': obj.key})
    elif obj.content_type == 'application/gzip':
        tar = tarfile.open(None, 'r:gz', fileobj = buffer)
        size = sum([tinfo.size for tinfo in tar.getmembers()])
        extract_archive({'Archive':tar, 'Size':size, 'Src':obj.key}, **kwargs)
        #results.append({'Archive':tar, 'Size': size, 'Src': obj.key})
        #results.append({'Archive':tar, 'Src': obj.key})
    #return tuple(results)


def is_archive_extracted(archive_data, staging_bucket):
    # Compare the archive's total uncompressed size against what is already
    # sitting under its prefix in the staging bucket.
    s3 = create_s3_session()
    archive = archive_data['Archive']
    size = archive_data['Size']
    src = archive_data['Src'].rsplit('.', 1)[0]

    extracted_size = 0
    for obj in s3.Bucket(staging_bucket).objects.filter(Prefix = 'extracted_data/' + src):
        extracted_size += obj.size

    print(f'\n{size}, {extracted_size}')
    if extracted_size != size:
        return False
    return True


def extract_archive(archive_data, **kwargs):
    #archive_data = archive_data[0]
    archive = archive_data['Archive']
    src = archive_data['Src']
    staging_bucket = kwargs['dest_bucket']

    extracted = is_archive_extracted(archive_data, staging_bucket)
    if not extracted:
        print(f'Extracting archive {src}')
        root_dir = src.rsplit('.', 1)[0]
        # 'Files' maps each destination key to the member it should be uploaded from.
        archive_content = {'Src': src, 'Archive': archive, 'Files': {}}

        if type(archive) == zipfile.ZipFile:
            for file in archive.infolist():
                key = 'extracted_data/' + root_dir + '/' + file.filename
                archive_content['Files'][key] = file

        if type(archive) == tarfile.TarFile:
            for member in archive.getmembers():
                file = archive.extractfile(member)
                if file is not None:
                    key = 'extracted_data/' + root_dir + '/' + member.name
                    archive_content['Files'][key] = file

        upload_archive(archive_content, staging_bucket)
        #return archive_content
    else:
        print(f'Archive {src} has already been extracted and uploaded to s3 bucket {staging_bucket}\n')


def upload_archive(archive_data, staging_bucket):
    s3 = create_s3_session()
    archive = archive_data['Archive']
    files = archive_data['Files']
    src = archive_data['Src']

    print(f'Uploading files for {src} to s3 bucket {staging_bucket}')
    for k, v in files.items():
        try:
            # Skip objects that already exist in the staging bucket.
            s3.Object(staging_bucket, k).load()
            continue
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '404':
                if type(archive) == zipfile.ZipFile:
                    s3.meta.client.upload_fileobj(
                        archive.open(v),
                        Bucket = staging_bucket,
                        Key = k
                    )
                elif type(archive) == tarfile.TarFile:
                    s3.meta.client.upload_fileobj(
                        v,
                        Bucket = staging_bucket,
                        Key = k
                    )
    print(f'Files uploaded for {src} to s3 bucket {staging_bucket}\n')
Any feedback would be greatly appreciated.