I have uploaded zip files to S3. I want to download them for processing. I don't need to store them permanently, but I do need to work with them temporarily. How can I do this?
Answer 0 (score: 16)
Because working software > comprehensive documentation:
import zipfile
import boto
import io

# Connect to s3
# This will need your s3 credentials to be set up
# with `aws configure` using the aws CLI.
#
# See: https://aws.amazon.com/cli/
conn = boto.connect_s3()

# get hold of the bucket
bucket = conn.get_bucket("my_bucket_name")

# Get hold of a given file
key = boto.s3.key.Key(bucket)
key.key = "my_s3_object_key"

# Create an in-memory bytes IO buffer
with io.BytesIO() as b:
    # Read the file into it
    key.get_file(b)
    # Reset the file pointer to the beginning
    b.seek(0)
    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(b, mode='r') as zipf:
        for subfile in zipf.namelist():
            do_stuff_with_subfile()
The same approach with boto3:

import zipfile
import boto3
import io

# this is just to demo. real use should use the config
# environment variables or config file.
#
# See: http://boto3.readthedocs.org/en/latest/guide/configuration.html
session = boto3.session.Session(
    aws_access_key_id="ACCESSKEY",
    aws_secret_access_key="SECRETKEY"
)

s3 = session.resource("s3")
bucket = s3.Bucket('stackoverflow-brice-test')
obj = bucket.Object('smsspamcollection.zip')

with io.BytesIO(obj.get()["Body"].read()) as tf:
    # rewind the file
    tf.seek(0)
    # Read the file as a zipfile and process the members
    with zipfile.ZipFile(tf, mode='r') as zipf:
        for subfile in zipf.namelist():
            print(subfile)
Tested on MacOSX with Python3.
Answer 1 (score: 2)
If speed is a concern, a good approach is to pick an EC2 instance very close to your S3 bucket (in the same region) and use that instance to unzip/process your zipped files.
This reduces latency and lets you process them fairly efficiently. You can remove each extracted file as soon as you're done with it.
Note: this only works if you're OK with using EC2 instances.
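The download-process-delete loop described above can be sketched as follows. This is a minimal sketch: the S3 download onto the instance is stubbed with a locally created zip (in real use, a boto3 `download_file` call would produce it), and all names are placeholders.

```python
import os
import tempfile
import zipfile

def process_and_remove(zip_path):
    """Extract member names from a downloaded zip, then delete the file."""
    with zipfile.ZipFile(zip_path, 'r') as zf:
        names = zf.namelist()  # real code would process each member here
    os.unlink(zip_path)        # reclaim disk space once processing is done
    return names

# Stand-in for a zip that was downloaded from S3 onto the EC2 instance
fd, path = tempfile.mkstemp(suffix=".zip")
os.close(fd)
with zipfile.ZipFile(path, 'w') as zf:
    zf.writestr("data.txt", "hello")

print(process_and_remove(path))   # ['data.txt']
print(os.path.exists(path))       # False
```

Deleting each archive right after processing keeps the instance's disk usage bounded even when iterating over many objects.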
Answer 2 (score: 1)
I believe you have heard of boto, the Python interface to Amazon Web Services. You can get a key (i.e. a file) from s3 like this:
import os
import boto
from zipfile import ZipFile

s3 = boto.connect_s3()                    # connect
bucket = s3.get_bucket(bucket_name)       # get bucket
key = bucket.get_key(key_name)            # get key (the file in s3)
key.get_contents_to_filename(local_name)  # save it to a temporary local file

with ZipFile(local_name, 'r') as myzip:
    ...  # do something with myzip

os.unlink(local_name)  # delete it
You could also use tempfile. For more details, see create & read from tempfile.
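A minimal sketch of the tempfile route, under the assumption that a locally written zip stands in for the S3 download (with boto3 the same file object could instead be filled via `bucket.download_fileobj(key_name, tmp)`):

```python
import tempfile
import zipfile

with tempfile.TemporaryFile() as tmp:
    # In real use, download the S3 object into `tmp` here;
    # this demo writes a small zip locally instead.
    with zipfile.ZipFile(tmp, 'w') as zf:
        zf.writestr("example.txt", "contents")
    tmp.seek(0)  # rewind before reading the archive back
    with zipfile.ZipFile(tmp, 'r') as zf:
        print(zf.namelist())  # ['example.txt']
# the temporary file is deleted automatically when the block exits
```

Unlike the manual `os.unlink` approach, `TemporaryFile` handles cleanup for you even if processing raises an exception.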
Answer 3 (score: 0)
Pandas provides a shortcut for this, which removes most of the code from the top answer and lets you be agnostic about whether the file path is on s3, gcp, or your local machine.
import io
import zipfile
import pandas as pd

obj = pd.io.parsers.get_filepath_or_buffer(file_path)[0]

with io.BytesIO(obj.read()) as byte_stream:
    # Use your byte stream, to, for example, print file names...
    with zipfile.ZipFile(byte_stream, mode='r') as zipf:
        for subfile in zipf.namelist():
            print(subfile)