请帮我找到一种使用 Python pandas 从 S3 存储桶中高效读取多个文件的方案。我正在做一个 ETL:从 S3 存储桶中读取日志文件,进行相应的转换,然后加载到 Redshift 中。一小时内会生成成千上万个日志文件,最少约 1600 个,最多约 2500 个,每个文件大小约为 5MB。我目前使用以下代码从 S3 存储桶中读取文件,并将其追加到一个 Pandas 数据框中(一个小时的数据):
self.query_date1_d = '2020-09-01 10:00:00'
self.query_date2_d = '2020-09-01 11:00:00'
self.filter_date1 = '2020-09-01'
'''Creating a PageIterator from the Paginator to load objects from S3 bucket'''
self.page_iterator = self.paginator.paginate(**self.operation_parameters)
self.jsondf = pd.DataFrame()
self.s3_files = []
for self.page in self.page_iterator:
for self.s3_objects in self.page['Contents']:
if self.filter_date1 in self.s3_objects['Key']:
self.file_name = self.s3_objects['Key']
self.last_modified_date = self.s3_objects['LastModified']
self.object_date = self.last_modified_date.strftime('%Y-%m-%d %H:%M:%S')
self.object_date = datetime.strptime(self.object_date, '%Y-%m-%d %H:%M:%S')
if self.query_date1_d <= self.object_date < self.query_date2_d:
self.s3_files.append(self.file_name)
print(self.file_name + ' :: ' + str(self.object_date))
self.obj = self.client.get_object(Bucket=self.my_bucket, Key=self.file_name)
self.initial_df = pd.read_json(self.obj['Body'], lines=True)
self.jsondf = self.jsondf.append(self.initial_df, ignore_index=True, sort=True)
return self.jsondf
但是,仅仅是从 S3 中读取文件并将其追加到 Pandas 数据框这一步,就花费了两个多小时。有没有更高效的方法可以快速读取这些文件?目前,存储桶中已经包含大约 20000 个文件。请帮帮我!
下面是JSON示例:
{
"env": "production",
"event_type": "RESPONSE",
"log_type": "rest-api",
"method": "GET",
"user_id": "wcel",
"mid": "wcel",
"request_uri": "",
"app_name": "DF",
"ecs": {
"version": "1.0.0"
},
"@timestamp": "2020-08-27T06:27:11.293Z",
"tags": ["Some", "DF", "out-S3", "out-elastic-search"],
"ip_address": "00.00.00.00",
"log_time": "2020-08-27 06:27:11",
"payload": {
"Excel2Offer": [{
"contractType": "SIB",
"prodName": "Olivier",
"price": 1800.0,
"qty": 1,
"marketPrice": null,
"marketPriceDate": null,
"contractId": 807225,
"unitSize": "2",
"lastTradeDate": null,
"code": "2154",
"lastChangeOn": 548798,
"lastListPrice": null,
"wUrl": "/n11=10197192014",
"yourId": null,
"specialInfo": null,
"orderGuid": "2f75132e-6ff0-498c-978d-f57bc5953ec1",
"xCode": null,
"lastListDate": null,
"lastTradePrice": null,
"year": "2014",
"region": "south"
}],
"Excel2Offer1": [{
"contractType": "SIB",
"prodName": "Olivier",
"price": 1800.0,
"qty": 1,
"marketPrice": null,
"marketPriceDate": null,
"contractId": 807225,
"unitSize": "2",
"lastTradeDate": null,
"code": "2154",
"lastChangeOn": 548798,
"lastListPrice": null,
"wUrl": "/n11=10197192014",
"yourId": null,
"specialInfo": null,
"orderGuid": "2f75132e-6ff0-498c-978d-f57bc5953ec1",
"xCode": null,
"lastListDate": null,
"lastTradePrice": null,
"year": "2014",
"region": "south"
}]
},
"relation_id": "5487we-asd4-87we-65qw-54a2154qw"
}
谢谢。