我有一个相当简单的数据流作业,该作业从BigQuery读取并写入Spanner实例。所有读取均已完成,但由于某种原因,在开始写入Spanner时,它始终会失败并显示EOFException,并且它在MutationGroupEncoder类中。大约有580万行和1.36GB的数据已分区,但是在写入约3500行后失败。
下面的异常跟踪-堆栈跟踪中没有涉及我们的代码,所有库代码都在整理输出数据。我找不到其他人报告类似的错误。我们使用的是Google Cloud Apache Beam SDK 2.5.0版。
java.lang.RuntimeException: org.apache.beam.sdk.util.UserCodeException: java.lang.RuntimeException: java.io.EOFException
at com.google.cloud.dataflow.worker.GroupAlsoByWindowsParDoFn$1.output(GroupAlsoByWindowsParDoFn.java:183)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner$1.outputWindowedValue(GroupAlsoByWindowFnRunner.java:102)
at com.google.cloud.dataflow.worker.util.BatchGroupAlsoByWindowViaIteratorsFn.processElement(BatchGroupAlsoByWindowViaIteratorsFn.java:124)
at com.google.cloud.dataflow.worker.util.BatchGroupAlsoByWindowViaIteratorsFn.processElement(BatchGroupAlsoByWindowViaIteratorsFn.java:53)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner.invokeProcessElement(GroupAlsoByWindowFnRunner.java:115)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner.processElement(GroupAlsoByWindowFnRunner.java:73)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowsParDoFn.processElement(GroupAlsoByWindowsParDoFn.java:113)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:43)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:48)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:200)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.start(ReadOperation.java:158)
at com.google.cloud.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:75)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.executeWork(BatchDataflowWorker.java:391)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.doWork(BatchDataflowWorker.java:360)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.getAndPerformWork(BatchDataflowWorker.java:288)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.doWork(DataflowBatchWorkerHarness.java:134)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:114)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:101)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.beam.sdk.util.UserCodeException: java.lang.RuntimeException: java.io.EOFException
at org.apache.beam.sdk.util.UserCodeException.wrap(UserCodeException.java:36)
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$BatchFn$DoFnInvoker.invokeProcessElement(Unknown Source)
at org.apache.beam.runners.core.SimpleDoFnRunner.invokeProcessElement(SimpleDoFnRunner.java:185)
at org.apache.beam.runners.core.SimpleDoFnRunner.processElement(SimpleDoFnRunner.java:146)
at com.google.cloud.dataflow.worker.SimpleParDoFn.processElement(SimpleParDoFn.java:323)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:43)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:48)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowsParDoFn$1.output(GroupAlsoByWindowsParDoFn.java:181)
... 21 more
Caused by: java.lang.RuntimeException: java.io.EOFException
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.decode(MutationGroupEncoder.java:271)
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$BatchFn.processElement(SpannerIO.java:1030)
Caused by: java.io.EOFException
at java.io.DataInputStream.readFully(DataInputStream.java:197)
at java.io.DataInputStream.readFully(DataInputStream.java:169)
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.readBytes(MutationGroupEncoder.java:475)
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.decodePrimitive(MutationGroupEncoder.java:434)
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.decodeModification(MutationGroupEncoder.java:326)
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.decodeMutation(MutationGroupEncoder.java:280)
at org.apache.beam.sdk.io.gcp.spanner.MutationGroupEncoder.decode(MutationGroupEncoder.java:264)
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$BatchFn.processElement(SpannerIO.java:1030)
at org.apache.beam.sdk.io.gcp.spanner.SpannerIO$BatchFn$DoFnInvoker.invokeProcessElement(Unknown Source)
at org.apache.beam.runners.core.SimpleDoFnRunner.invokeProcessElement(SimpleDoFnRunner.java:185)
at org.apache.beam.runners.core.SimpleDoFnRunner.processElement(SimpleDoFnRunner.java:146)
at com.google.cloud.dataflow.worker.SimpleParDoFn.processElement(SimpleParDoFn.java:323)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:43)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:48)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowsParDoFn$1.output(GroupAlsoByWindowsParDoFn.java:181)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner$1.outputWindowedValue(GroupAlsoByWindowFnRunner.java:102)
at com.google.cloud.dataflow.worker.util.BatchGroupAlsoByWindowViaIteratorsFn.processElement(BatchGroupAlsoByWindowViaIteratorsFn.java:124)
at com.google.cloud.dataflow.worker.util.BatchGroupAlsoByWindowViaIteratorsFn.processElement(BatchGroupAlsoByWindowViaIteratorsFn.java:53)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner.invokeProcessElement(GroupAlsoByWindowFnRunner.java:115)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowFnRunner.processElement(GroupAlsoByWindowFnRunner.java:73)
at com.google.cloud.dataflow.worker.GroupAlsoByWindowsParDoFn.processElement(GroupAlsoByWindowsParDoFn.java:113)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:43)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:48)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:200)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.start(ReadOperation.java:158)
at com.google.cloud.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:75)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.executeWork(BatchDataflowWorker.java:391)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.doWork(BatchDataflowWorker.java:360)
at com.google.cloud.dataflow.worker.BatchDataflowWorker.getAndPerformWork(BatchDataflowWorker.java:288)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.doWork(DataflowBatchWorkerHarness.java:134)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:114)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:101)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
答案 0 :(得分:2)
有相同的问题,更改为2.6.0后已解决,并且能够上传100 GB(3.5亿条记录)。但是我发现与2.4.0相比,数据流作业速度很慢。