Issue loading XML data into a BigQuery sink using Cloud Data Fusion

Date: 2019-11-14 14:31:59

Tags: xml google-cloud-platform google-cloud-data-fusion cdap google-bigquery

  

I am trying to load an XML file into a BigQuery data sink through Cloud Data Fusion. I am unable to load this particular XML file; the pipeline fails with the error below.


> org.apache.spark.SparkException: Job aborted due to stage failure:
> Task 0 in stage 0.0 failed 4 times, most recent failure: Lost task 0.3
> in stage 0.0 (TID 3,
> cdap-promopocp-c439e4ad-06e4-11ea-9714-ded3d232b4e4-w-1.c.p-asna-analytics-002.internal,
> executor 2): org.apache.spark.SparkException: Task failed while
> writing rows  at
> org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:155)
>   at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
>   at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
>   at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
>   at org.apache.spark.scheduler.Task.run(Task.scala:109)  at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
>   at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>   at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>   at java.lang.Thread.run(Thread.java:748) Caused by:
> java.lang.NullPointerException    at
> io.cdap.plugin.gcp.bigquery.sink.AvroRecordWriter.close(AvroRecordWriter.java:104)
>   at
> org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.closeWriter(SparkHadoopWriter.scala:361)
>   at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:137)
>   at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:127)
>   at
> org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1415)
>   at
> org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:139)
>   ... 8 more  Suppressed: java.lang.NullPointerException      at
> io.cdap.plugin.gcp.bigquery.sink.AvroRecordWriter.close(AvroRecordWriter.java:104)
>       at
> org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.closeWriter(SparkHadoopWriter.scala:361)
>       at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$1.apply$mcV$sp(SparkHadoopWriter.scala:142)
>       at
> org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1424)
>       ... 9 more
> 
> Driver stacktrace:    at
> org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1661)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1649)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1648)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
> ~[scala-library-2.11.8.jar:na]    at
> scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
> ~[scala-library-2.11.8.jar:na]    at
> org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1648)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> scala.Option.foreach(Option.scala:257) ~[scala-library-2.11.8.jar:na]
>   at
> org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1882)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1831)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1820)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
> ~[spark-core_2.11-2.3.3.jar:na]   at
> org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
> ~[na:2.3.3]   at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
> ~[na:2.3.3]   at
> org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
> ~[na:2.3.3]   at
> org.apache.spark.internal.io.SparkHadoopWriter$.write(SparkHadoopWriter.scala:78)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1083)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1081)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1081)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> org.apache.spark.rdd.PairRDDFunctions.saveAsNewAPIHadoopDataset(PairRDDFunctions.scala:1081)
> [spark-core_2.11-2.3.3.jar:2.3.3]     at
> io.cdap.cdap.app.runtime.spark.DefaultSparkExecutionContext.saveAsNewAPIHadoopDataset(DefaultSparkExecutionContext.scala:44)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext$$anonfun$saveAsDataset$1.apply(AbstractSparkExecutionContext.scala:313)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext$$anonfun$saveAsDataset$1.apply(AbstractSparkExecutionContext.scala:304)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext$$anon$7.run(AbstractSparkExecutionContext.scala:451)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SparkTransactional.execute(SparkTransactional.java:208)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext.saveAsDataset(AbstractSparkExecutionContext.scala:442)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext.saveAsDataset(AbstractSparkExecutionContext.scala:304)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SerializableSparkExecutionContext.saveAsDataset(SerializableSparkExecutionContext.scala:68)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.DefaultJavaSparkExecutionContext.saveAsDataset(DefaultJavaSparkExecutionContext.scala:125)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.DefaultJavaSparkExecutionContext.saveAsDataset(DefaultJavaSparkExecutionContext.scala:117)
> [na:na]   at
> io.cdap.cdap.etl.spark.batch.SparkBatchSinkFactory.writeFromRDD(SparkBatchSinkFactory.java:103)
> [hydrator-spark-core2_2.11-6.1.0.jar:na]  at
> io.cdap.cdap.etl.spark.batch.RDDCollection$1.run(RDDCollection.java:179)
> [hydrator-spark-core2_2.11-6.1.0.jar:na]  at
> io.cdap.cdap.etl.spark.SparkPipelineRunner.runPipeline(SparkPipelineRunner.java:350)
> [hydrator-spark-core2_2.11-6.1.0.jar:na]  at
> io.cdap.cdap.etl.spark.batch.BatchSparkPipelineDriver.run(BatchSparkPipelineDriver.java:151)
> [hydrator-spark-core2_2.11-6.1.0.jar:na]  at
> io.cdap.cdap.app.runtime.spark.SparkTransactional$2.run(SparkTransactional.java:236)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SparkTransactional.execute(SparkTransactional.java:208)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SparkTransactional.execute(SparkTransactional.java:138)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.AbstractSparkExecutionContext.execute(AbstractSparkExecutionContext.scala:229)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SerializableSparkExecutionContext.execute(SerializableSparkExecutionContext.scala:60)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.DefaultJavaSparkExecutionContext.execute(DefaultJavaSparkExecutionContext.scala:88)
> [na:na]   at
> io.cdap.cdap.api.Transactionals.execute(Transactionals.java:63)
> [na:na]   at
> io.cdap.cdap.etl.spark.batch.BatchSparkPipelineDriver.run(BatchSparkPipelineDriver.java:118)
> [hydrator-spark-core2_2.11-6.1.0.jar:na]  at
> io.cdap.cdap.app.runtime.spark.SparkMainWrapper$.main(SparkMainWrapper.scala:86)
> [na:na]   at
> io.cdap.cdap.app.runtime.spark.SparkMainWrapper.main(SparkMainWrapper.scala)
> [na:na]   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native
> Method) ~[na:1.8.0_222]   at
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> ~[na:1.8.0_222]   at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> ~[na:1.8.0_222]   at java.lang.reflect.Method.invoke(Method.java:498)
> ~[na:1.8.0_222]   at
> org.apache.spark.deploy.yarn.ApplicationMaster$$anon$4.run(ApplicationMaster.scala:721)
> [spark-yarn_2.11-2.3.3.jar:2.3.3] Caused by:
> org.apache.spark.SparkException: Task failed while writing rows   at
> org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:155)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:83)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.scheduler.Task.run(Task.scala:109)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
> ~[na:1.8.0_222]   at
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
> ~[na:1.8.0_222]   at java.lang.Thread.run(Thread.java:748)
> ~[na:1.8.0_222] Caused by: java.lang.NullPointerException: null   at
> io.cdap.plugin.gcp.bigquery.sink.AvroRecordWriter.close(AvroRecordWriter.java:104)
> ~[1573739273851-0/:na]    at
> org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.closeWriter(SparkHadoopWriter.scala:361)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:137)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:127)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    at
> org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1415)
> ~[spark-core_2.11-2.3.3.jar:na]   at
> org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:139)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]    ... 8 common frames omitted
>   Suppressed: java.lang.NullPointerException: null        at
> io.cdap.plugin.gcp.bigquery.sink.AvroRecordWriter.close(AvroRecordWriter.java:104)
> ~[1573739273851-0/:na]        at
> org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.closeWriter(SparkHadoopWriter.scala:361)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]        at
> org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$1.apply$mcV$sp(SparkHadoopWriter.scala:142)
> ~[spark-core_2.11-2.3.3.jar:2.3.3]        at
> org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1424)
> ~[spark-core_2.11-2.3.3.jar:na]       ... 9 common frames omitted

1 Answer:

Answer 0 (score: 2)

This is a known issue that has already been fixed: https://issues.cask.co/browse/PLUGIN-83
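
For context, the `NullPointerException` at `AvroRecordWriter.close(AvroRecordWriter.java:104)` in the trace above is consistent with a record writer whose underlying output is only created lazily when the first record arrives, so `close()` dereferences a null field on any task that never wrote a row. The sketch below is a hypothetical illustration of that failure pattern and the kind of defensive null check a fix would add; it is not the actual plugin source, and the class, method, and field names are assumptions.

```java
// Hypothetical sketch of the failure pattern suggested by the stack trace.
// Not the actual io.cdap.plugin.gcp.bigquery.sink.AvroRecordWriter source;
// names and structure are assumptions for illustration only.
import java.io.IOException;

public class LazyAvroRecordWriter {
  // Only created when the first record is written (lazy initialization).
  private AvroFileWriter delegate;

  public void write(Object record) throws IOException {
    if (delegate == null) {
      delegate = AvroFileWriter.open(); // hypothetical helper
    }
    delegate.append(record);
  }

  // Buggy version: throws NullPointerException when close() runs on a
  // task that never wrote a record, so delegate was never initialized.
  public void closeBuggy() throws IOException {
    delegate.flush();
    delegate.close();
  }

  // Fixed version: guard against the never-initialized case.
  public void close() throws IOException {
    if (delegate != null) {
      delegate.flush();
      delegate.close();
    }
  }

  // Minimal stand-in so the sketch is self-contained.
  static class AvroFileWriter {
    static AvroFileWriter open() { return new AvroFileWriter(); }
    void append(Object record) { /* write record */ }
    void flush() { /* flush buffers */ }
    void close() { /* release resources */ }
  }
}
```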

Please create a new Cloud Data Fusion instance to make sure you have the latest fixes.