ParquetIO writer error when using Apache Beam

Time: 2018-08-28 16:14:37

Tags: java google-cloud-dataflow apache-beam

The following code runs fine with the DirectRunner, but fails when run with the SparkRunner. I am trying to write a Parquet file through ParquetIO.

18/08/28 11:56:51 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, eio4, executor 1): org.apache.beam.sdk.util.UserCodeException: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
        at org.apache.beam.sdk.util.UserCodeException.wrap(UserCodeException.java:36)
        at org.apache.beam.sdk.io.WriteFiles$WriteUnshardedTempFilesWithSpillingFn$DoFnInvoker.invokeFinishBundle(Unknown Source)
        at org.apache.beam.runners.core.SimpleDoFnRunner.finishBundle(SimpleDoFnRunner.java:195)
        at org.apache.beam.runners.spark.translation.DoFnRunnerWithMetrics.finishBundle(DoFnRunnerWithMetrics.java:89)
        at org.apache.beam.runners.spark.translation.SparkProcessContext$ProcCtxtIterator.computeNext(SparkProcessContext.java:154)
        at org.apache.beam.repackaged.beam_runners_spark.com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:145)
        at org.apache.beam.repackaged.beam_runners_spark.com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:140)
        at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
        at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
        at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1092)
        at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1083)
        at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1018)
        at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1083)
        at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:809)
        at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
        at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
        at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
        at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
        at org.apache.spark.scheduler.Task.run(Task.scala:109)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
        at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
        at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
        at java.lang.Thread.run(Thread.java:748)

My code:

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.hdfs.HadoopFileSystemOptions;
import org.apache.beam.sdk.io.parquet.ParquetIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class CopyFile {
    private static final org.apache.avro.Schema SCHEMA = new org.apache.avro.Schema.Parser().parse(
            "{\n"
                    + " \"namespace\": \"TestAvroLine\",\n"
                    + " \"type\": \"record\",\n"
                    + " \"name\": \"TestAvroLine\",\n"
                    + " \"fields\": [\n"
                    + "     {\"name\": \"row1\", \"type\": \"string\"}\n"

                    + " ]\n"
                    + "}");

    public static void main(String[] args) {

        //HadoopFileSystemOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(HadoopFileSystemOptions.class);
        PipelineOptions options = PipelineOptionsFactory.create().as(HadoopFileSystemOptions.class);
        options.setRunner(SparkRunner.class);

        Pipeline p = Pipeline.create(options);
        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS",args[13]);
        UserGroupInformation.setConfiguration(conf);
        // Kerberos login so the job can read from the secured HDFS cluster.
        try {
            UserGroupInformation.loginUserFromKeytab("shri@EIO.COM", "/opt/app/kerbfiles/shri.keytab");
            if(UserGroupInformation.isLoginKeytabBased()){
                UserGroupInformation.getLoginUser().reloginFromKeytab();
            }else if(UserGroupInformation.isLoginTicketBased()){
                UserGroupInformation.getLoginUser().reloginFromTicketCache();
            }
        }catch (IOException e1) {
            e1.printStackTrace();
        }
        p.apply("ReadLines", TextIO.read().from("hdfs://nameservicetor/hdfsdata/input/prices_1.txt"))
        .apply( ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
        .setCoder(AvroCoder.of(SCHEMA))
        .apply(
                FileIO.<GenericRecord>write()
                .via(ParquetIO.sink(SCHEMA))
                .to("file:/home/icedq/icestore/data/domain/test"));

        p.run().waitUntilFinish();

    }
    private static class DeterministicallyConstructAvroRecordsFn extends DoFn<String, GenericRecord> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            // Wrap each input line in a GenericRecord with a single "row1" field.
            c.output(new GenericRecordBuilder(SCHEMA).set("row1", c.element()).build());
        }
    }

}

After running this program I get the error below. Let me know if I am missing something.

18/08/28 11:56:53 INFO cluster.YarnClientSchedulerBackend: Stopped
18/08/28 11:56:53 INFO spark.MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
18/08/28 11:56:53 INFO memory.MemoryStore: MemoryStore cleared
18/08/28 11:56:53 INFO storage.BlockManager: BlockManager stopped
18/08/28 11:56:53 INFO storage.BlockManagerMaster: BlockManagerMaster stopped
18/08/28 11:56:53 INFO scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
18/08/28 11:56:53 INFO spark.SparkContext: Successfully stopped SparkContext
Exception in thread "main" org.apache.beam.sdk.Pipeline$PipelineExecutionException: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
        at org.apache.beam.runners.spark.SparkPipelineResult.beamExceptionFrom(SparkPipelineResult.java:66)
        at org.apache.beam.runners.spark.SparkPipelineResult.waitUntilFinish(SparkPipelineResult.java:99)
        at org.apache.beam.runners.spark.SparkPipelineResult.waitUntilFinish(SparkPipelineResult.java:87)
        at CopyFile.main(CopyFile.java:104)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
        at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:892)
        at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
        at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
        at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
        at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
        at org.apache.parquet.format.PageHeader.setUncompressed_page_sizeIsSet(PageHeader.java:325)
        at org.apache.parquet.format.PageHeader.<init>(PageHeader.java:216)
        at org.apache.parquet.format.converter.ParquetMetadataConverter.newDataPageHeader(ParquetMetadataConverter.java:1071)
        at org.apache.parquet.format.converter.ParquetMetadataConverter.writeDataPageHeader(ParquetMetadataConverter.java:1059)
        at org.apache.parquet.hadoop.ColumnChunkPageWriteStore$ColumnChunkPageWriter.writePage(ColumnChunkPageWriteStore.java:103)
        at org.apache.parquet.column.impl.ColumnWriterV1.writePage(ColumnWriterV1.java:147)
        at org.apache.parquet.column.impl.ColumnWriterV1.flush(ColumnWriterV1.java:235)
        at org.apache.parquet.column.impl.ColumnWriteStoreV1.flush(ColumnWriteStoreV1.java:122)
        at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:172)
        at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:114)
        at org.apache.parquet.hadoop.ParquetWriter.close(ParquetWriter.java:308)
        at org.apache.beam.sdk.io.parquet.ParquetIO$Sink.flush(ParquetIO.java:312)
        at org.apache.beam.sdk.io.FileIO$Write$ViaFileBasedSink$1$1.finishWrite(FileIO.java:1347)
        at org.apache.beam.sdk.io.FileBasedSink$Writer.close(FileBasedSink.java:991)
        at org.apache.beam.sdk.io.WriteFiles$WriteUnshardedTempFilesWithSpillingFn.finishBundle(WriteFiles.java:531)
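
My hunch (not verified) is that the NoSuchMethodError comes from two different versions of the parquet-format jar ending up on the Spark executor classpath: parquet-format relocates Thrift under shaded.parquet.org.apache.thrift, so a mismatched jar coming from the Spark/Hadoop distribution could explain why the setBit(BIZ)B signature is not found. As a rough check, something like the hypothetical DoFn below (dropped next to the existing DoFn in CopyFile so the existing imports apply; the class names are copied from the stack trace) should print which jar each class is actually resolved from on the executors:

    // Hypothetical diagnostic only: logs which jar the conflicting classes come from
    // on the Spark executors. Class names are taken from the stack trace above.
    private static class ClasspathDebugFn extends DoFn<String, String> {
        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            String[] names = {
                "org.apache.parquet.format.PageHeader",
                "shaded.parquet.org.apache.thrift.EncodingUtils"
            };
            for (String name : names) {
                Class<?> clazz = Class.forName(name);
                // getCodeSource() gives the jar URL the class was actually loaded from.
                System.out.println(name + " -> "
                        + clazz.getProtectionDomain().getCodeSource().getLocation());
            }
            c.output(c.element());
        }
    }

If the two classes resolve from different parquet-format versions, that would point at a classpath conflict between the jars bundled with the Spark/Hadoop distribution and the ones pulled in by beam-sdks-java-io-parquet, rather than at the pipeline code itself.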

0 Answers

There are no answers yet.