Issues with extracting large archives and uploading them to an AWS S3 bucket

Date: 2019-12-03 15:11:12

Tags: python amazon-s3 multiprocessing extraction tarfile

So I have a project I'm working on, and the first step is to extract the data from a bunch of archives stored in an AWS S3 bucket into a separate bucket. I'm trying to make this run as fast as possible, so I'm using a ThreadPoolExecutor to run the extractions in parallel. However, some of these archives are very large (e.g. 95 GB compressed), and I seem to be running into memory problems while processing them.

What I'm currently doing is:

1: Collect the compressed archives from the source S3 bucket

2: Read each archive file

3: Check whether that archive has already been extracted; if not, continue with steps 4 and 5

4: Extract the archive into a dictionary, where the key is the archive's name and the value is a list of all the files stored in the archive (I think this is where the problem is; see the sketch right after this list)

5: Upload the extracted files to the destination S3 bucket
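
To make the flow concrete, here is a condensed, simplified sketch of what steps 2-5 look like for a single .tar.gz archive (the bucket and key names are placeholders; the real code is below):

import io
import tarfile

import boto3

def process_one_archive(src_bucket, dest_bucket, key):
    s3 = boto3.resource('s3')

    # Step 2: the whole compressed object body is read into memory here.
    buffer = io.BytesIO(s3.Object(src_bucket, key).get()['Body'].read())

    root_dir = key.rsplit('.', 1)[0]
    with tarfile.open(fileobj = buffer, mode = 'r:gz') as tar:
        # Step 4: every member's file object is collected into a dict before uploading.
        files = {}
        for member in tar.getmembers():
            fileobj = tar.extractfile(member)
            if fileobj is not None:
                files['extracted_data/' + root_dir + '/' + member.name] = fileobj

        # Step 5: upload each extracted member to the destination bucket.
        for dest_key, fileobj in files.items():
            s3.meta.client.upload_fileobj(fileobj, Bucket = dest_bucket, Key = dest_key)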

Here is my main.py:

import utils
from threadpool import ThreadPool

if __name__ == '__main__':
    args = utils.collect_args()

    staging_bucket = utils.create_staging_bucket()

    print('Collecting archives...')
    archives = utils.collect_archives(args.prev_output)

    print('\nReading, extracting, and uploading archives...\n')
    pool = ThreadPool(archives['Archives'], utils.read_archive)
    pool.create_pool()
    pool.start_pool(src_bucket = archives['Bucket'], dest_bucket = staging_bucket)

Here is my ThreadPool class:

import concurrent.futures
import os
import time
from functools import partial

class ThreadPool():
    def __init__(self, items, func):
        self.items = items
        self.func = func
        self.workers = 8
        self.pool = None

    def create_pool(self):
        pool = concurrent.futures.ThreadPoolExecutor(max_workers = self.workers)
        self.pool = pool

    def start_pool(self, **kwargs):
        assert self.pool is not None, 'No pool has been created, please run create_pool first.'

        threads = []
        for item in self.items:
            # partial() pre-binds the keyword arguments, so each task runs as
            # self.func(item, **kwargs).
            thread = self.pool.submit(partial(self.func, **kwargs), item)
            threads.append(thread)

        # Note: _work_queue and _threads are private ThreadPoolExecutor internals,
        # used here only for progress reporting.
        pending_jobs = len(self.pool._work_queue.queue)
        print(f'Pending Jobs: {pending_jobs}')
        print(f'Threads: {len(self.pool._threads)}\n')

        if pending_jobs > 0:
            print('Estimated Pending Work Queue:\n')
            for num, item in enumerate(self.pool._work_queue.queue):
                print('{}\t{}\t{}\t{}'.format(num + 1, item.fn, item.args, item.kwargs))

        for thread in concurrent.futures.as_completed(threads):
            time.sleep(30)
            os.system('cls')
            pending_jobs = len(self.pool._work_queue.queue)

            print(f'\nPending Jobs: {pending_jobs}')
            if pending_jobs > 0:
                print('Estimated Pending Work Queue:')
                for num, item in enumerate(self.pool._work_queue.queue):
                    print('{}\t{}\t{}\t{}'.format(num + 1, item.fn, item.args, item.kwargs))

        _, _ = concurrent.futures.wait(threads)
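
For clarity, this is roughly how the submit call above expands: functools.partial pre-binds the keyword arguments, so each worker ends up calling read_archive(archive_key, src_bucket=..., dest_bucket=...). A standalone illustration, where demo_func and the bucket names are hypothetical placeholders:

from functools import partial

def demo_func(item, **kwargs):
    # Hypothetical stand-in for utils.read_archive, just to show the call shape.
    return (item, kwargs['src_bucket'], kwargs['dest_bucket'])

bound = partial(demo_func, src_bucket = 'source-bucket', dest_bucket = 'staging-bucket')
print(bound('some-archive.tar.gz'))
# prints: ('some-archive.tar.gz', 'source-bucket', 'staging-bucket')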

Here are the functions that get called during this process:

import io
import json
import tarfile
import zipfile

import botocore

# create_s3_session() (not shown) returns a boto3 S3 resource.

def collect_archives(prev_output):
    s3 = create_s3_session()

    with open(prev_output, 'rb') as input:
        data = input.read().decode('utf-8')
        data = json.loads(data)

    archives_dict = {'Bucket': data['input_bucket'], 'Archives': None}
    archives = []
    for obj in s3.Bucket(data['input_bucket']).objects.all():
        archives.append(obj.key)

    archives_dict['Archives'] = archives
    return archives_dict

def read_archive(archive, **kwargs):
    s3 = create_s3_session()

    obj = s3.Object(kwargs['src_bucket'], archive)
    # The entire compressed object body is downloaded into memory here.
    buffer = io.BytesIO(obj.get()['Body'].read())
    if obj.content_type == 'application/zip':
        zip = zipfile.ZipFile(buffer)
        size = sum([zinfo.file_size for zinfo in zip.filelist])
        extract_archive({'Archive':zip, 'Size':size, 'Src':obj.key}, **kwargs)
    elif obj.content_type == 'application/gzip':
        tar = tarfile.open(None, 'r:gz', fileobj = buffer)
        size = sum([tinfo.size for tinfo in tar.getmembers()])
        extract_archive({'Archive':tar, 'Size':size, 'Src':obj.key}, **kwargs)

def is_archive_extracted(archive_data, staging_bucket):
    s3 = create_s3_session()

    archive = archive_data['Archive']
    size = archive_data['Size']
    src = archive_data['Src'].rsplit('.', 1)[0]

    extracted_size = 0
    for obj in s3.Bucket(staging_bucket).objects.filter(Prefix = 'extracted_data/' + src):
        extracted_size += obj.size

    print(f'\n{size}, {extracted_size}')
    if extracted_size != size:
        return False

    return True

def extract_archive(archive_data, **kwargs):
    archive = archive_data['Archive']
    src = archive_data['Src']
    staging_bucket = kwargs['dest_bucket']

    extracted = is_archive_extracted(archive_data, staging_bucket)
    if not extracted:
        print(f'Extracting archive {src}')
        root_dir = src.rsplit('.', 1)[0]
        archive_content = {'Src': src, 'Archive': archive, 'Files': {}}
        if type(archive) == zipfile.ZipFile:
            for file in archive.infolist():
                key = 'extracted_data/' + root_dir + '/' + file.filename
                archive_content['Files'][key] = file

        if type(archive) == tarfile.TarFile:
            for member in archive.getmembers():
                file = archive.extractfile(member)
                if file is not None:
                    key = 'extracted_data/' + root_dir + '/' + member.name
                    archive_content['Files'][key] = file

        upload_archive(archive_content, staging_bucket)
    else:
        print(f'Archive {src} has already been extracted and uploaded to s3 bucket {staging_bucket}\n')

def upload_archive(archive_data, staging_bucket):
    s3 = create_s3_session()

    archive = archive_data['Archive']
    files = archive_data['Files']
    src = archive_data['Src']

    print(f'Uploading files for {src} to s3 bucket {staging_bucket}')
    for k, v in files.items():
        try:
            s3.Object(staging_bucket, k).load()
            continue
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '404':
                if type(archive) == zipfile.ZipFile:
                    s3.meta.client.upload_fileobj(
                        archive.open(v),
                        Bucket = staging_bucket,
                        Key = k
                    )
                elif type(archive) == tarfile.TarFile:
                    s3.meta.client.upload_fileobj(
                        v,
                        Bucket = staging_bucket,
                        Key = k
                    )

    print(f'Files uploaded for {src} to s3 bucket {staging_bucket}\n')

Any feedback would be greatly appreciated.

0 Answers