我在 S3 的存储桶中有一个文件 myfile.txt.gz
。我正在使用 Glue 作业将此文件转换为 Parquet 格式。代码如下所示。
import com.amazonaws.services.glue.ChoiceOption
import com.amazonaws.services.glue.GlueContext
import com.amazonaws.services.glue.MappingSpec
import com.amazonaws.services.glue.ResolveSpec
import com.amazonaws.services.glue.errors.CallSite
import com.amazonaws.services.glue.util.GlueArgParser
import com.amazonaws.services.glue.util.Job
import com.amazonaws.services.glue.util.JsonOptions
import org.apache.spark.SparkContext
import scala.collection.JavaConverters._
import com.amazonaws.services.glue.DynamicFrame
import org.apache.hadoop.fs._;
import org.apache.spark.sql.functions._
object GlueApp {
// Maximum number of characters kept in the "body" column. NOTE(review): the
// target type is varchar(65535); presumably 29999 was chosen to stay under a
// byte-size limit for multi-byte characters — confirm against the Redshift/
// Parquet consumer before changing.
private val MaxBodyLength = 29999

/**
 * Glue job entry point.
 *
 * Reads the "myfile" table from the "data" catalog database, truncates the
 * "body" column to MaxBodyLength characters, applies a column mapping for
 * "id" and "body", resolves choice types, drops null fields and writes the
 * result to S3 as Parquet.
 *
 * @param sysArgs Glue job arguments; must contain JOB_NAME.
 */
def main(sysArgs: Array[String]): Unit = {
val spark: SparkContext = new SparkContext()
val glueContext: GlueContext = new GlueContext(spark)
val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray)
Job.init(args("JOB_NAME"), glueContext, args.asJava)

// Null-safe truncation. The original lambda called str.length without a null
// check; Spark hands `null` to a String-typed Scala UDF for NULL column
// values, so any row with a NULL "body" threw a NullPointerException inside
// the executor task — which can then surface as secondary errors such as
// RejectedExecutionException once the task pool shuts down.
val truncColUdf = udf((str: String) =>
  Option(str)
    .map(s => if (s.length > MaxBodyLength) s.substring(0, MaxBodyLength) else s)
    .orNull
)

// myfile
val datasource21 = glueContext.getCatalogSource(database = "data", tableName = "myfile", redshiftTmpDir = "", transformationContext = "datasource21").getDynamicFrame()
val revDF21_1 = datasource21.toDF().withColumn("body", truncColUdf(col("body")))
val truncDynamicFrame21_1 = DynamicFrame(revDF21_1, glueContext)
val applymapping21 = truncDynamicFrame21_1.applyMapping(
  mappings = Seq(
    ("id", "bigint", "id", "bigint"),
    ("body", "varchar(65535)", "body", "varchar(65535)")
  ),
  caseSensitive = false,
  transformationContext = "applymapping21"
)
val resolvechoice21 = applymapping21.resolveChoice(choiceOption = Some(ChoiceOption("make_struct")), transformationContext = "resolvechoice21")
val dropnullfields21 = resolvechoice21.dropNulls(transformationContext = "dropnullfields21")
// Write as Parquet; the sink's return value was previously bound to an
// unused local (datasink21), removed here.
glueContext.getSinkWithFormat(connectionType = "s3", options = JsonOptions("""{"path": "s3://mypath/myfilefolder"}"""), transformationContext = "datasink21", format = "parquet").writeDynamicFrame(dropnullfields21)
Job.commit()
}
}
运行此作业时，出现如下错误消息：
java.util.concurrent.RejectedExecutionException: Task ... rejected from java.util.concurrent.ThreadPoolExecutor（线程池已关闭，任务被拒绝）
但是，当我针对同一个未压缩的文件 myfile.txt
运行完全相同的作业时，它能按预期工作。我也尝试过去掉截断那一行代码后再运行，得到了相同的错误。我想知道这个错误是什么意思，以及为什么会出现。
编辑：经过更多测试，我认为该错误是由截断引起的，因为所有不需要截断的文件都能正常运行。我能想到的一个可能原因是：由于文件是压缩的，可能无法读取该列来执行截断函数。有人能确认这一点并提供解决方法吗？提前致谢。