I'm trying to use the AWS Glue ETL tool to parse CSV files into Parquet. I'm following this tutorial from the Amazon website.
Since I have a bunch of CSV files in an S3 bucket, I compress their contents with GZIP. I'm using the following Lambda function to download the data as a zip, extract the CSV files, and save them as gzip-compressed CSV files in my S3 bucket:
import json
import boto3
from botocore.vendored import requests
import zipfile
from gzip import GzipFile
from io import BytesIO
def lambda_handler(event, context):
    s3 = boto3.resource('s3')
    bucket_name = "my-bucket"

    file_url = 'http://dados.cvm.gov.br/dados/FI/DOC/CDA/DADOS/'
    file_name = "cda_fi_201801"

    req = requests.get(file_url + file_name + ".zip", stream=True)
    data = req.raw.read()

    zf = zipfile.ZipFile(BytesIO(data))
    for fn in zf.namelist():
        bytes = zf.read(fn).decode("windows-1252")
        print('File:', fn)
        print('has', len(bytes), 'bytes')

        # Choose folder name to put csv file
        parts = fn.split("_")
        folder = "PL"
        if parts[2] == "BLC":
            folder = "BLC_" + parts[3]

        # BytesIO to not save to disk
        gz_body = BytesIO()
        gz = GzipFile(None, 'wb', 9, gz_body)

        # Write csv bytes to gzip body
        gz.write(bytes.encode('utf8'))

        s3.Bucket(bucket_name).put_object(Key=folder + "/" + fn,
                                          ContentType="text/plain",
                                          ContentEncoding='gzip',
                                          Body=gz_body.getvalue())
        gz.close()

    return {
        'statusCode': 200,
    }
Running the AWS Glue crawler on the blc_1 folder, I get the following table properties:
{
"StorageDescriptor": {
"cols": {
"FieldSchema": [
{
"name": "tp_fundo",
"type": "string",
"comment": ""
},
{
"name": "cnpj_fundo",
"type": "string",
"comment": ""
},
{
"name": "denom_social",
"type": "string",
"comment": ""
},
{
"name": "dt_comptc",
"type": "string",
"comment": ""
},
{
"name": "tp_aplic",
"type": "string",
"comment": ""
},
{
"name": "tp_ativo",
"type": "string",
"comment": ""
},
{
"name": "emissor_ligado",
"type": "string",
"comment": ""
},
{
"name": "tp_negoc",
"type": "string",
"comment": ""
},
{
"name": "qt_venda_negoc",
"type": "double",
"comment": ""
},
{
"name": "vl_venda_negoc",
"type": "double",
"comment": ""
},
{
"name": "qt_aquis_negoc",
"type": "double",
"comment": ""
},
{
"name": "vl_aquis_negoc",
"type": "double",
"comment": ""
},
{
"name": "qt_pos_final",
"type": "double",
"comment": ""
},
{
"name": "vl_merc_pos_final",
"type": "double",
"comment": ""
},
{
"name": "vl_custo_pos_final",
"type": "string",
"comment": ""
},
{
"name": "dt_confid_aplic",
"type": "string",
"comment": ""
},
{
"name": "tp_titpub",
"type": "string",
"comment": ""
},
{
"name": "cd_isin",
"type": "string",
"comment": ""
},
{
"name": "cd_selic",
"type": "bigint",
"comment": ""
},
{
"name": "dt_emissao",
"type": "string",
"comment": ""
},
{
"name": "dt_venc",
"type": "string",
"comment": ""
}
]
},
"location": "s3://my-bucket/BLC_1/",
"inputFormat": "org.apache.hadoop.mapred.TextInputFormat",
"outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
"compressed": "true",
"numBuckets": "-1",
"SerDeInfo": {
"name": "",
"serializationLib": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
"parameters": {
"field.delim": ";"
}
},
"bucketCols": [],
"sortCols": [],
"parameters": {
"skip.header.line.count": "1",
"sizeKey": "731056",
"objectCount": "1",
"UPDATED_BY_CRAWLER": "blc-1",
"CrawlerSchemaSerializerVersion": "1.0",
"recordCount": "1884",
"averageRecordSize": "258",
"CrawlerSchemaDeserializerVersion": "1.0",
"compressionType": "gzip",
"classification": "csv",
"columnsOrdered": "true",
"areColumnsQuoted": "false",
"delimiter": ";",
"typeOfData": "file"
},
"SkewedInfo": {},
"storedAsSubDirectories": "false"
},
"parameters": {
"skip.header.line.count": "1",
"sizeKey": "731056",
"objectCount": "1",
"UPDATED_BY_CRAWLER": "blc-1",
"CrawlerSchemaSerializerVersion": "1.0",
"recordCount": "1884",
"averageRecordSize": "258",
"CrawlerSchemaDeserializerVersion": "1.0",
"compressionType": "gzip",
"classification": "csv",
"columnsOrdered": "true",
"areColumnsQuoted": "false",
"delimiter": ";",
"typeOfData": "file"
}
}
After that, I followed the tutorial and tried to run the ETL job with the following script, auto-generated by Glue:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "cvm", table_name = "blc_1", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "cvm", table_name = "blc_1", transformation_ctx = "datasource0")
## @type: ApplyMapping
## @args: [mapping = [("tp_fundo", "string", "tp_fundo", "string"), ("cnpj_fundo", "string", "cnpj_fundo", "string"), ("denom_social", "string", "denom_social", "string"), ("dt_comptc", "string", "dt_comptc", "string"), ("tp_aplic", "string", "tp_aplic", "string"), ("tp_ativo", "string", "tp_ativo", "string"), ("emissor_ligado", "string", "emissor_ligado", "string"), ("tp_negoc", "string", "tp_negoc", "string"), ("qt_venda_negoc", "double", "qt_venda_negoc", "double"), ("vl_venda_negoc", "double", "vl_venda_negoc", "double"), ("qt_aquis_negoc", "double", "qt_aquis_negoc", "double"), ("vl_aquis_negoc", "double", "vl_aquis_negoc", "double"), ("qt_pos_final", "double", "qt_pos_final", "double"), ("vl_merc_pos_final", "double", "vl_merc_pos_final", "double"), ("vl_custo_pos_final", "string", "vl_custo_pos_final", "string"), ("dt_confid_aplic", "string", "dt_confid_aplic", "string"), ("tp_titpub", "string", "tp_titpub", "string"), ("cd_isin", "string", "cd_isin", "string"), ("cd_selic", "long", "cd_selic", "long"), ("dt_emissao", "string", "dt_emissao", "string"), ("dt_venc", "string", "dt_venc", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("tp_fundo", "string", "tp_fundo", "string"), ("cnpj_fundo", "string", "cnpj_fundo", "string"), ("denom_social", "string", "denom_social", "string"), ("dt_comptc", "string", "dt_comptc", "string"), ("tp_aplic", "string", "tp_aplic", "string"), ("tp_ativo", "string", "tp_ativo", "string"), ("emissor_ligado", "string", "emissor_ligado", "string"), ("tp_negoc", "string", "tp_negoc", "string"), ("qt_venda_negoc", "double", "qt_venda_negoc", "double"), ("vl_venda_negoc", "double", "vl_venda_negoc", "double"), ("qt_aquis_negoc", "double", "qt_aquis_negoc", "double"), ("vl_aquis_negoc", "double", "vl_aquis_negoc", "double"), ("qt_pos_final", "double", "qt_pos_final", "double"), ("vl_merc_pos_final", "double", "vl_merc_pos_final", "double"), ("vl_custo_pos_final", "string", "vl_custo_pos_final", "string"), ("dt_confid_aplic", "string", "dt_confid_aplic", "string"), ("tp_titpub", "string", "tp_titpub", "string"), ("cd_isin", "string", "cd_isin", "string"), ("cd_selic", "long", "cd_selic", "long"), ("dt_emissao", "string", "dt_emissao", "string"), ("dt_venc", "string", "dt_venc", "string")], transformation_ctx = "applymapping1")
## @type: ResolveChoice
## @args: [choice = "make_struct", transformation_ctx = "resolvechoice2"]
## @return: resolvechoice2
## @inputs: [frame = applymapping1]
resolvechoice2 = ResolveChoice.apply(frame = applymapping1, choice = "make_struct", transformation_ctx = "resolvechoice2")
## @type: DropNullFields
## @args: [transformation_ctx = "dropnullfields3"]
## @return: dropnullfields3
## @inputs: [frame = resolvechoice2]
dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3")
## @type: DataSink
## @args: [connection_type = "s3", connection_options = {"path": "s3://my-bucket/blc_1"}, format = "parquet", transformation_ctx = "datasink4"]
## @return: datasink4
## @inputs: [frame = dropnullfields3]
datasink4 = glueContext.write_dynamic_frame.from_options(frame = dropnullfields3, connection_type = "s3", connection_options = {"path": "s3://my-bucket/blc_1"}, format = "parquet", transformation_ctx = "datasink4")
job.commit()
I'm getting the following error:
19/03/27 19:10:07 WARN TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, ip-172-32-89-229.us-east-2.compute.internal, executor 1): com.amazonaws.services.glue.util.FatalException: Unable to parse file: cda_fi_BLC_1_201801.csv

at com.amazonaws.services.glue.readers.JacksonReader.hasNextFailSafe(JacksonReader.scala:94)
at com.amazonaws.services.glue.readers.JacksonReader.hasNext(JacksonReader.scala:38)
at com.amazonaws.services.glue.hadoop.TapeHadoopRecordReader.nextKeyValue(TapeHadoopRecordReader.scala:63)
at org.apache.spark.rdd.NewHadoopRDD$$anon$1.hasNext(NewHadoopRDD.scala:207)
at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:148)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask...
Unfortunately, it doesn't give any clue about what's going on. I was able to run the ETL when the CSV content was not compressed, so my best guess is that I'm doing something wrong with the gzip compression, or that some configuration is missing.
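As a sanity check on that guess, one thing I can do is download one of the uploaded objects and try to decompress it locally. This is just a rough verification sketch, not part of the Lambda above; the bucket and key names are examples:

import gzip
import boto3
from io import BytesIO

s3 = boto3.resource('s3')
obj = s3.Object('my-bucket', 'BLC_1/cda_fi_BLC_1_201801.csv')
body = obj.get()['Body'].read()

# read() raises an error if the stream is truncated or not valid gzip
with gzip.GzipFile(fileobj=BytesIO(body)) as f:
    data = f.read()

print(data[:200].decode('utf8'))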
If you have any idea of what is happening, your help would be much appreciated.
Thank you!
Answer 0 (score: 0):
This TapeHadoopRecordReader is very sensitive. I suggest you do the following instead:
df = spark.read.csv('s3://path')
Then you can use fromDF() to convert the DataFrame back into a DynamicFrame, as sketched below.
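For example, a minimal sketch of that approach inside the generated job (the path, delimiter and header options below are assumptions based on the question, not tested code):

from awsglue.dynamicframe import DynamicFrame

# Read the semicolon-delimited CSVs with Spark's own reader,
# treating the first line as the header row
df = (spark.read
      .option("header", "true")
      .option("sep", ";")
      .csv("s3://my-bucket/BLC_1/"))

# Convert back to a DynamicFrame so the rest of the generated job
# (ApplyMapping, the Parquet sink, etc.) can be kept as it is
datasource0 = DynamicFrame.fromDF(df, glueContext, "datasource0")

Here spark and glueContext are the objects already created at the top of the generated script.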