I am running an Apache Beam pipeline with the DataflowRunner. Inside the pipeline I can write a moderate amount of data from a CSV file to a BigQuery table; authentication uses ServiceAccountCredentials built from a JSON key file. For larger inputs I get the exception below, which suggests the code is unable to obtain a refresh token:
Caused by: java.lang.NullPointerException
at com.google.auth.oauth2.ServiceAccountCredentials.refreshAccessToken(ServiceAccountCredentials.java:342)
at com.google.auth.oauth2.OAuth2Credentials.refresh(OAuth2Credentials.java:149)
at com.google.auth.oauth2.OAuth2Credentials.getRequestMetadata(OAuth2Credentials.java:135)
at com.google.auth.http.HttpCredentialsAdapter.initialize(HttpCredentialsAdapter.java:96)
at com.google.cloud.http.HttpTransportOptions$1.initialize(HttpTransportOptions.java:157)
at com.google.api.client.http.HttpRequestFactory.buildRequest(HttpRequestFactory.java:93)
at com.google.api.client.http.HttpRequestFactory.buildPostRequest(HttpRequestFactory.java:133)
at com.google.cloud.bigquery.spi.v2.HttpBigQueryRpc.open(HttpBigQueryRpc.java:403)
at com.google.cloud.bigquery.TableDataWriteChannel$2.call(TableDataWriteChannel.java:78)
at com.google.cloud.bigquery.TableDataWriteChannel$2.call(TableDataWriteChannel.java:75)
at com.google.api.gax.retrying.DirectRetryingExecutor.submit(DirectRetryingExecutor.java:91)
at com.google.cloud.RetryHelper.run(RetryHelper.java:74)
at com.google.cloud.RetryHelper.runWithRetries(RetryHelper.java:51)
at com.google.cloud.bigquery.TableDataWriteChannel.open(TableDataWriteChannel.java:75)
at com.google.cloud.bigquery.TableDataWriteChannel.<init>(TableDataWriteChannel.java:42)
at com.google.cloud.bigquery.BigQueryImpl.writer(BigQueryImpl.java:677)
at com.datametica.bigsuite.web.service.pelican.transfer.transforms.DataTransferTransform.processElement(DataTransferTransform.java:74)
at com.datametica.bigsuite.web.service.pelican.transfer.transforms.DataTransferTransform$DoFnInvoker.invokeProcessElement(Unknown Source)
at org.apache.beam.runners.core.SimpleDoFnRunner.invokeProcessElement(SimpleDoFnRunner.java:177)
at org.apache.beam.runners.core.SimpleDoFnRunner.processElement(SimpleDoFnRunner.java:138)
at com.google.cloud.dataflow.worker.SimpleParDoFn.processElement(SimpleParDoFn.java:324)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:48)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:52)
at com.google.cloud.dataflow.worker.SimpleParDoFn$1.output(SimpleParDoFn.java:272)
at org.apache.beam.runners.core.SimpleDoFnRunner.outputWindowedValue(SimpleDoFnRunner.java:211)
at org.apache.beam.runners.core.SimpleDoFnRunner.access$700(SimpleDoFnRunner.java:66)
at org.apache.beam.runners.core.SimpleDoFnRunner$DoFnProcessContext.output(SimpleDoFnRunner.java:436)
at org.apache.beam.runners.core.SimpleDoFnRunner$DoFnProcessContext.output(SimpleDoFnRunner.java:424)
at com.datametica.bigsuite.web.service.pelican.transfer.transforms.TransferCommandHolderDoFn.processElement(TransferCommandHolderDoFn.java:32)
at com.datametica.bigsuite.web.service.pelican.transfer.transforms.TransferCommandHolderDoFn$DoFnInvoker.invokeProcessElement(Unknown Source)
at org.apache.beam.runners.core.SimpleDoFnRunner.invokeProcessElement(SimpleDoFnRunner.java:177)
at org.apache.beam.runners.core.SimpleDoFnRunner.processElement(SimpleDoFnRunner.java:138)
at com.google.cloud.dataflow.worker.SimpleParDoFn.processElement(SimpleParDoFn.java:324)
at com.google.cloud.dataflow.worker.util.common.worker.ParDoOperation.process(ParDoOperation.java:48)
at com.google.cloud.dataflow.worker.util.common.worker.OutputReceiver.process(OutputReceiver.java:52)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.runReadLoop(ReadOperation.java:187)
at com.google.cloud.dataflow.worker.util.common.worker.ReadOperation.start(ReadOperation.java:148)
at com.google.cloud.dataflow.worker.util.common.worker.MapTaskExecutor.execute(MapTaskExecutor.java:68)
at com.google.cloud.dataflow.worker.DataflowWorker.executeWork(DataflowWorker.java:330)
at com.google.cloud.dataflow.worker.DataflowWorker.doWork(DataflowWorker.java:302)
at com.google.cloud.dataflow.worker.DataflowWorker.getAndPerformWork(DataflowWorker.java:251)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.doWork(DataflowBatchWorkerHarness.java:135)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:115)
at com.google.cloud.dataflow.worker.DataflowBatchWorkerHarness$WorkerThread.call(DataflowBatchWorkerHarness.java:102)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
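
For context, the BigQuery client used below is built from the service-account JSON key by a small helper. This is a minimal sketch of what PipelineLauncher.createBigQueryClient roughly does (paraphrased; the key path and the exact signature here are illustrative, not the real helper):

import com.google.auth.oauth2.ServiceAccountCredentials;
import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import java.io.FileInputStream;
import java.io.IOException;

// Paraphrased helper: load the service-account key and build a BigQuery client.
static BigQuery createBigQueryClient(String keyPath, String projectId) throws IOException {
    ServiceAccountCredentials credentials;
    try (FileInputStream keyStream = new FileInputStream(keyPath)) {
        credentials = ServiceAccountCredentials.fromStream(keyStream);
    }
    return BigQueryOptions.newBuilder()
            .setCredentials(credentials)
            .setProjectId(projectId)
            .build()
            .getService();
}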
Below is the sample code, taken from the DoFn's processElement method:
bigquery = PipelineLauncher.createBigQueryClient(credentials, projectId, configuration);
TableId tableId = TableId.of(transferCommandHolder.getValue().getTargetDbName(),
        transferCommandHolder.getValue().getTableName());

// Configure a CSV load job that appends to the target table.
WriteChannelConfiguration writeChannelConfiguration = WriteChannelConfiguration.newBuilder(tableId)
        .setFormatOptions(FormatOptions.csv())
        .setSchema(transferCommandHolder.getValue().getBigQueryTargetSchema())
        .setWriteDisposition(JobInfo.WriteDisposition.WRITE_APPEND)
        .build();

// Opening the write channel is where the NullPointerException above is thrown.
TableDataWriteChannel writer = bigquery.writer(writeChannelConfiguration);
Path csvPath = FileSystems.getDefault().getPath(".", transferCommandHolder.getKey());

// Stream the local CSV into the channel; closing the stream closes the
// channel and submits the load job.
try (OutputStream stream = Channels.newOutputStream(writer)) {
    Files.copy(csvPath, stream);
}

// Block until the load job finishes, then emit whether it ended with an error.
Job job = writer.getJob();
job = job.waitFor();
c.output(KV.of(transferCommandHolder.getKey(), null != job.getStatus().getError()));
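
Since the trace points at ServiceAccountCredentials.refreshAccessToken, the token refresh itself seems to fail on the workers; with larger inputs more elements open writers, so more refreshes happen, which may be why only big runs hit it. Here is a minimal sketch that forces the same refresh with the same key outside the pipeline, just to check whether the key can fetch tokens at all (the path and scope are illustrative):

import com.google.auth.oauth2.GoogleCredentials;
import com.google.auth.oauth2.ServiceAccountCredentials;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Collections;

public class RefreshCheck {
    public static void main(String[] args) throws IOException {
        // Load the same service-account key the pipeline uses.
        ServiceAccountCredentials credentials;
        try (FileInputStream keyStream = new FileInputStream("/path/to/key.json")) {
            credentials = ServiceAccountCredentials.fromStream(keyStream);
        }
        // Scope the credentials, then force an immediate token fetch; refresh()
        // is the call that fails with the NullPointerException in the trace above.
        GoogleCredentials scoped = credentials.createScoped(
                Collections.singletonList("https://www.googleapis.com/auth/cloud-platform"));
        scoped.refresh();
        System.out.println("access token acquired: " + (scoped.getAccessToken() != null));
    }
}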