如何在不下载的情况下,统计 AWS S3 中 zip 文件内包含的文件数量?

时间:2017-01-22 09:10:57

标签: python amazon-web-services amazon-s3 boto

案例: S3 存储桶中有一个包含大量图像的大型 zip 文件。有没有办法在不下载整个文件的情况下读取其元数据,或者得知该 zip 文件中包含多少个文件?


6 个答案:

答案 0 :(得分:1)


import zlib
import zipfile
import io

def fetch(key_name, start, len, client_s3, bucket=None):
    """Range-fetch part of an S3 object and return it as bytes.

    Args:
        key_name: key of the object inside the bucket.
        start: offset of the first byte to read.
        len: number of bytes to read. The name shadows the builtin but is
            kept so that existing keyword callers keep working.
        client_s3: S3 client exposing ``get_object``
            (e.g. ``boto3.client("s3")``).
        bucket: bucket to read from. When omitted, the module-level
            ``bucket_name`` (assigned by the script entry point) is used,
            which is what this function always did before.

    Returns:
        The bytes of the inclusive range ``start .. start + len - 1``.
    """
    if bucket is None:
        # Historical behaviour: fall back to the global set under __main__.
        bucket = bucket_name
    # HTTP Range end offsets are inclusive, hence the -1.
    last = start + len - 1
    s3_object = client_s3.get_object(Bucket=bucket, Key=key_name, Range="bytes=%d-%d" % (start, last))
    return s3_object['Body'].read()

def parse_int(bytes):
    """Parse a little-endian unsigned integer from a byte string.

    ZIP headers store their 2- and 4-byte fields least-significant byte
    first; wider fields (e.g. 8-byte ZIP64 values) are decoded the same way.

    Args:
        bytes: the raw field. The name shadows the builtin but is kept for
            backward compatibility with keyword callers.

    Returns:
        The non-negative integer value of the field.
    """
    return int.from_bytes(bytes, "little")

_EOCD_SIGNATURE = b"PK\x05\x06"
_EOCD_MIN_SIZE = 22          # fixed part of the End Of Central Directory record
_MAX_ZIP_COMMENT_SIZE = 0xFFFF


def _fetch_range(client_s3, bucket_name, key_name, start, length):
    """Return ``length`` bytes of the object starting at offset ``start``."""
    if length <= 0:
        # A zero-length range cannot be expressed in a Range header.
        return b""
    end = start + length - 1  # Range end offsets are inclusive
    response = client_s3.get_object(Bucket=bucket_name, Key=key_name, Range="bytes=%d-%d" % (start, end))
    return response['Body'].read()


def _fetch_eocd(client_s3, bucket_name, key_name, size):
    """Return the EOCD record (plus the trailing archive comment, if any)."""
    if size < _EOCD_MIN_SIZE:
        raise zipfile.BadZipFile("object is too small to be a zip archive")

    # Common case: no archive comment, so the record is the last 22 bytes.
    tail = _fetch_range(client_s3, bucket_name, key_name, size - _EOCD_MIN_SIZE, _EOCD_MIN_SIZE)
    if tail[:4] == _EOCD_SIGNATURE and tail[20:22] == b"\x00\x00":
        return tail

    # Otherwise a comment follows the record; search the largest window in
    # which the record can start.
    window = min(size, _EOCD_MIN_SIZE + _MAX_ZIP_COMMENT_SIZE)
    tail = _fetch_range(client_s3, bucket_name, key_name, size - window, window)
    position = tail.rfind(_EOCD_SIGNATURE)
    if position < 0 or len(tail) - position < _EOCD_MIN_SIZE:
        raise zipfile.BadZipFile("end of central directory record not found")
    return tail[position:]


def list_files_in_s3_zipped_object(bucket_name, key_name, client_s3):
    """List the files in a zipped S3 object without downloading it.

    Only the End Of Central Directory (EOCD) record and the central
    directory are fetched with ranged GET requests; the entries are printed
    and their count is returned.

    See : https://stackoverflow.com/questions/41789176/how-to-count-files-inside-zip-in-aws-s3-without-downloading-it
    Based on : https://stackoverflow.com/questions/51351000/read-zip-files-from-s3-without-downloading-the-entire-file

    Args:
        bucket_name: name of the bucket
        key_name: path to zipfile inside bucket
        client_s3: an object created using boto3.client("s3")

    Returns:
        The number of entries inside the zip file.

    Raises:
        zipfile.BadZipFile: if no EOCD record can be located.
        NotImplementedError: if the archive stores its central directory
            location in ZIP64 structures.
    """
    response = client_s3.head_object(Bucket=bucket_name, Key=key_name)
    size = response['ContentLength']

    eocd = _fetch_eocd(client_s3, bucket_name, key_name, size)

    # size and start offset of the central directory (little-endian fields)
    cd_size = int.from_bytes(eocd[12:16], "little")
    cd_start = int.from_bytes(eocd[16:20], "little")
    if cd_size == 0xFFFFFFFF or cd_start == 0xFFFFFFFF:
        # 0xFFFFFFFF means the real value lives in the ZIP64 EOCD record.
        raise NotImplementedError("ZIP64 archives are not supported")

    # fetch central directory, append EOCD, and open as zipfile!
    cd = _fetch_range(client_s3, bucket_name, key_name, cd_start, cd_size)
    with zipfile.ZipFile(io.BytesIO(cd + eocd)) as archive:
        entries = archive.filelist

    print("there are %s files in the zipfile" % len(entries))

    for entry in entries:
        print("filename: %s (%s bytes uncompressed)" % (entry.filename, entry.file_size))
    return len(entries)

if __name__ == "__main__":
    # Command-line entry point: python <script> BUCKET_NAME KEY_NAME
    import boto3
    import sys

    # Fail with a usage hint instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.exit("usage: %s BUCKET_NAME KEY_NAME" % sys.argv[0])

    client_s3 = boto3.client("s3")
    # These stay module-level names: fetch() reads the global bucket_name.
    bucket_name = sys.argv[1]
    key_name = sys.argv[2]
    list_files_in_s3_zipped_object(bucket_name, key_name, client_s3)

答案 1 :(得分:0)

如果不下载 ZIP 文件,则无法执行此操作。S3 只支持对对象执行有限的几种操作。

请参阅Operations on Objects

答案 2 :(得分:0)




答案 3 :(得分:0)


jar vt < first-part-of-archive.zip


答案 4 :(得分:0)

试试下面的 Linux 命令(使用 AWS CLI)来获取计数:

aws s3 cp <s3 file uri> - | gunzip -c | grep -i '<Search String>' | wc -l


aws s3 cp s3://test-bucket/test/test.gz - | gunzip -c | grep -i 'test' | wc -l

答案 5 :(得分:0)

我改进了已经给出的解决方案 - 现在它也可以处理大于 4GiB 的文件:

import boto3
import io
import struct
import zipfile

s3 = boto3.client('s3')

# Sizes in bytes of the fixed-length records at the end of a ZIP archive.
EOCD_RECORD_SIZE = 22
ZIP64_EOCD_RECORD_SIZE = 56
ZIP64_EOCD_LOCATOR_SIZE = 20

# Largest value a 32-bit ZIP field can hold; larger archives need ZIP64.
MAX_STANDARD_ZIP_SIZE = 4_294_967_295

def lambda_handler(event, context=None):
    """Print the entries of the zip archive named by ``event``.

    Args:
        event: mapping with the keys ``'bucket'`` and ``'key'`` identifying
            the S3 object.
        context: runtime context object; unused. It is optional so that
            existing single-argument callers keep working.
    """
    bucket = event['bucket']
    key = event['key']
    # The archive object only wraps the in-memory central directory, so
    # closing it after printing releases nothing but that buffer.
    with get_zip_file(bucket, key) as zip_file:
        print_zip_content(zip_file)

def get_zip_file(bucket, key):
    """Build a ``ZipFile`` from only the trailing records of an S3 object.

    Fetches the End Of Central Directory (EOCD) record and the central
    directory with ranged requests and concatenates them into an in-memory
    archive that is sufficient for listing entries (not for reading them).

    Objects larger than ``MAX_STANDARD_ZIP_SIZE`` are read through the ZIP64
    EOCD record and locator that sit directly before the EOCD record.

    NOTE(review): the EOCD is taken to be exactly the last
    ``EOCD_RECORD_SIZE`` bytes, so archives with a trailing comment are not
    handled here.

    Args:
        bucket: name of the S3 bucket.
        key: key of the zip object inside the bucket.

    Returns:
        A ``zipfile.ZipFile`` backed by the fetched directory bytes.
    """
    file_size = get_file_size(bucket, key)
    eocd_record = fetch(bucket, key, file_size - EOCD_RECORD_SIZE, EOCD_RECORD_SIZE)
    if file_size <= MAX_STANDARD_ZIP_SIZE:
        cd_start, cd_size = get_central_directory_metadata_from_eocd(eocd_record)
        central_directory = fetch(bucket, key, cd_start, cd_size)
        return zipfile.ZipFile(io.BytesIO(central_directory + eocd_record))

    # ZIP64 layout at the end of the file:
    #   [zip64 EOCD record][zip64 EOCD locator][EOCD record]
    zip64_eocd_record = fetch(bucket, key,
                              file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE + ZIP64_EOCD_RECORD_SIZE),
                              ZIP64_EOCD_RECORD_SIZE)
    zip64_eocd_locator = fetch(bucket, key,
                               file_size - (EOCD_RECORD_SIZE + ZIP64_EOCD_LOCATOR_SIZE),
                               ZIP64_EOCD_LOCATOR_SIZE)
    cd_start, cd_size = get_central_directory_metadata_from_eocd64(zip64_eocd_record)
    central_directory = fetch(bucket, key, cd_start, cd_size)
    return zipfile.ZipFile(io.BytesIO(central_directory + zip64_eocd_record + zip64_eocd_locator + eocd_record))

def get_file_size(bucket, key):
    """Return the ``ContentLength`` that ``head_object`` reports for the object."""
    return s3.head_object(Bucket=bucket, Key=key)['ContentLength']

def fetch(bucket, key, start, length):
    """Read ``length`` bytes of an S3 object, beginning at offset ``start``.

    The request uses a ``Range`` header whose end offset is inclusive, which
    is why the last byte requested is ``start + length - 1``.
    """
    byte_range = "bytes=%d-%d" % (start, start + length - 1)
    body = s3.get_object(Bucket=bucket, Key=key, Range=byte_range)['Body']
    return body.read()

def get_central_directory_metadata_from_eocd(eocd):
    """Return ``(cd_start, cd_size)`` read from an EOCD record.

    The size is decoded from bytes 12..15 and the start offset from bytes
    16..19 of ``eocd``.
    """
    size, offset = (
        parse_little_endian_to_int(eocd[field:field + 4]) for field in (12, 16)
    )
    return offset, size

def get_central_directory_metadata_from_eocd64(eocd64):
    """Return ``(cd_start, cd_size)`` read from a ZIP64 EOCD record.

    The size is decoded from bytes 40..47 and the start offset from bytes
    48..55 of ``eocd64``.
    """
    size, offset = (
        parse_little_endian_to_int(eocd64[field:field + 8]) for field in (40, 48)
    )
    return offset, size

def parse_little_endian_to_int(little_endian_bytes):
    """Decode a 4- or 8-byte little-endian unsigned integer.

    The value is unpacked as unsigned so that a field with its top bit set
    (for example an offset of 2 GiB or more in a 4-byte field) is returned
    as a positive number rather than a negative one.

    Args:
        little_endian_bytes: exactly 4 bytes, or exactly 8 bytes.

    Returns:
        The non-negative integer value.

    Raises:
        struct.error: if the input is neither 4 nor 8 bytes long.
    """
    format_character = "I" if len(little_endian_bytes) == 4 else "Q"
    return struct.unpack("<" + format_character, little_endian_bytes)[0]

def print_zip_content(zip_file):
    """Print the number of entries in ``zip_file`` followed by their names."""
    names = zip_file.namelist()
    print(f"{len(names)} files: {names}")