The following code works when run with the DirectRunner, but fails when run with the SparkRunner. I am trying to write a Parquet file through ParquetIO.
18/08/28 11:56:51 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, eio4, executor 1): org.apache.beam.sdk.util.UserCodeException: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
at org.apache.beam.sdk.util.UserCodeException.wrap(UserCodeException.java:36)
at org.apache.beam.sdk.io.WriteFiles$WriteUnshardedTempFilesWithSpillingFn$DoFnInvoker.invokeFinishBundle(Unknown Source)
at org.apache.beam.runners.core.SimpleDoFnRunner.finishBundle(SimpleDoFnRunner.java:195)
at org.apache.beam.runners.spark.translation.DoFnRunnerWithMetrics.finishBundle(DoFnRunnerWithMetrics.java:89)
at org.apache.beam.runners.spark.translation.SparkProcessContext$ProcCtxtIterator.computeNext(SparkProcessContext.java:154)
at org.apache.beam.repackaged.beam_runners_spark.com.google.common.collect.AbstractIterator.tryToComputeNext(AbstractIterator.java:145)
at org.apache.beam.repackaged.beam_runners_spark.com.google.common.collect.AbstractIterator.hasNext(AbstractIterator.java:140)
at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1092)
at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1083)
at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1018)
at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1083)
at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:809)
at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
My code:
import java.io.IOException;

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.io.FileIO;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.hdfs.HadoopFileSystemOptions;
import org.apache.beam.sdk.io.parquet.ParquetIO;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.security.UserGroupInformation;

public class CopyFile {

    private static final org.apache.avro.Schema SCHEMA = new org.apache.avro.Schema.Parser().parse(
        "{\n"
            + " \"namespace\": \"TestAvroLine\",\n"
            + " \"type\": \"record\",\n"
            + " \"name\": \"TestAvroLine\",\n"
            + " \"fields\": [\n"
            + "     {\"name\": \"row1\", \"type\": \"string\"}\n"
            + " ]\n"
            + "}");

    public static void main(String[] args) {
        //HadoopFileSystemOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(HadoopFileSystemOptions.class);
        PipelineOptions options = PipelineOptionsFactory.create().as(HadoopFileSystemOptions.class);
        options.setRunner(SparkRunner.class);
        Pipeline p = Pipeline.create(options);

        Configuration conf = new Configuration();
        //conf.set("fs.defaultFS",args[13]);
        UserGroupInformation.setConfiguration(conf);
        try {
            UserGroupInformation.loginUserFromKeytab("shri@EIO.COM", "/opt/app/kerbfiles/shri.keytab");
            if (UserGroupInformation.isLoginKeytabBased()) {
                UserGroupInformation.getLoginUser().reloginFromKeytab();
            } else if (UserGroupInformation.isLoginTicketBased()) {
                UserGroupInformation.getLoginUser().reloginFromTicketCache();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }

        p.apply("ReadLines", TextIO.read().from("hdfs://nameservicetor/hdfsdata/input/prices_1.txt"))
            .apply(ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
            .setCoder(AvroCoder.of(SCHEMA))
            .apply(
                FileIO.<GenericRecord>write()
                    .via(ParquetIO.sink(SCHEMA))
                    .to("file:/home/icedq/icestore/data/domain/test"));

        p.run().waitUntilFinish();
    }

    private static class DeterministicallyConstructAvroRecordsFn extends DoFn<String, GenericRecord> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            c.output(
                new GenericRecordBuilder(SCHEMA).set("row1", c.element()).build()
            );
        }
    }
}
After running this program, I get the error below. Let me know if I am missing something.
18/08/28 11:56:53 INFO cluster.YarnClientSchedulerBackend: Stopped
18/08/28 11:56:53 INFO spark.MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
18/08/28 11:56:53 INFO memory.MemoryStore: MemoryStore cleared
18/08/28 11:56:53 INFO storage.BlockManager: BlockManager stopped
18/08/28 11:56:53 INFO storage.BlockManagerMaster: BlockManagerMaster stopped
18/08/28 11:56:53 INFO scheduler.OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
18/08/28 11:56:53 INFO spark.SparkContext: Successfully stopped SparkContext
Exception in thread "main" org.apache.beam.sdk.Pipeline$PipelineExecutionException: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
at org.apache.beam.runners.spark.SparkPipelineResult.beamExceptionFrom(SparkPipelineResult.java:66)
at org.apache.beam.runners.spark.SparkPipelineResult.waitUntilFinish(SparkPipelineResult.java:99)
at org.apache.beam.runners.spark.SparkPipelineResult.waitUntilFinish(SparkPipelineResult.java:87)
at CopyFile.main(CopyFile.java:104)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:892)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:197)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:227)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:136)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NoSuchMethodError: shaded.parquet.org.apache.thrift.EncodingUtils.setBit(BIZ)B
at org.apache.parquet.format.PageHeader.setUncompressed_page_sizeIsSet(PageHeader.java:325)
at org.apache.parquet.format.PageHeader.<init>(PageHeader.java:216)
at org.apache.parquet.format.converter.ParquetMetadataConverter.newDataPageHeader(ParquetMetadataConverter.java:1071)
at org.apache.parquet.format.converter.ParquetMetadataConverter.writeDataPageHeader(ParquetMetadataConverter.java:1059)
at org.apache.parquet.hadoop.ColumnChunkPageWriteStore$ColumnChunkPageWriter.writePage(ColumnChunkPageWriteStore.java:103)
at org.apache.parquet.column.impl.ColumnWriterV1.writePage(ColumnWriterV1.java:147)
at org.apache.parquet.column.impl.ColumnWriterV1.flush(ColumnWriterV1.java:235)
at org.apache.parquet.column.impl.ColumnWriteStoreV1.flush(ColumnWriteStoreV1.java:122)
at org.apache.parquet.hadoop.InternalParquetRecordWriter.flushRowGroupToStore(InternalParquetRecordWriter.java:172)
at org.apache.parquet.hadoop.InternalParquetRecordWriter.close(InternalParquetRecordWriter.java:114)
at org.apache.parquet.hadoop.ParquetWriter.close(ParquetWriter.java:308)
at org.apache.beam.sdk.io.parquet.ParquetIO$Sink.flush(ParquetIO.java:312)
at org.apache.beam.sdk.io.FileIO$Write$ViaFileBasedSink$1$1.finishWrite(FileIO.java:1347)
at org.apache.beam.sdk.io.FileBasedSink$Writer.close(FileBasedSink.java:991)
at org.apache.beam.sdk.io.WriteFiles$WriteUnshardedTempFilesWithSpillingFn.finishBundle(WriteFiles.java:531)
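
For context, a NoSuchMethodError on shaded.parquet.org.apache.thrift.EncodingUtils typically indicates that two different versions of parquet-format (each shading its own libthrift) end up on the classpath, for example the one bundled with the Spark distribution versus the one pulled in by Beam's ParquetIO. Below is a hedged sketch of one common workaround, making the application jar's classes take precedence over the cluster's; the jar name, class name, and master are placeholders, and whether this resolves the error in this specific setup is not confirmed:

# Prefer the parquet-format version bundled in the fat jar over Spark's own copy
spark-submit \
  --master yarn \
  --conf spark.driver.userClassPathFirst=true \
  --conf spark.executor.userClassPathFirst=true \
  --class CopyFile \
  copyfile-bundled.jar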