我试图将s3
桶中的一堆拼花文件正确读入数据帧进行处理。我使用boto3
,我不太确定如何获取数据,然后将它们正确地union
放在一起(如果需要)。这是我尝试过的:
# def distributedParquetRead(s3_key, bucket_name='instructure-canvas-logs'):
# s3obj = b3.resource('s3').Object(
# bucket_name=bucket_name,
# key=s3_key
# )
# data_frame = sql_context.read.parquet(s3obj.get()['body'].read())
# return data_frame
if __name__ == '__main__':
s3 = b3.resource('s3')
bucket_name = 'instructure-canvas-logs'
log_bucket = s3.Bucket(bucket_name)
key = r'access-canvas/parquet/year=2016/month=04/'
keys = [s3_key.key for s3_key in log_bucket.objects.filter(Prefix=key)]
s3 = b3.client('s3').get_object(
Bucket=bucket_name,
Key=keys[0]
)
df = sql_context.read.parquet(s3['Body'])
df.select('user_id').show(1)
# pkeys = sc.parallelize(keys)
# data_frames = pkeys.flatMap(distributedParquetRead)
编辑:根据要求,这是Traceback:
Traceback (most recent call last):
File "/Users/mharris/Code/PycharmProjects/spark/canvas_logs/canvas_requests_logs.py", line 31, in <module>
df = sql_context.read.parquet(s3['Body'])
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 205, in parquet
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/pyspark.zip/pyspark/sql/column.py", line 61, in _to_seq
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 798, in __call__
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 785, in _get_args
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/java_collections.py", line 501, in convert
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py", line 804, in __call__
File "/usr/local/Cellar/apache-spark/1.6.0/libexec/python/lib/py4j-0.9-src.zip/py4j/protocol.py", line 278, in get_command_part
AttributeError: 'StreamingBody' object has no attribute '_get_object_id'