I am using Spark to read an ORC-backed Hive table in which one column stores a large JSON document (up to 40 MB per value). The read still fails even after applying the following settings:
set orc.compress.size=4096
set hive.exec.orc.default.stripe.size=268435456
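
For reference, here is a minimal sketch of how I read the table; it assumes a Hive-enabled SparkSession, and the table name db.events is a placeholder, with the settings above applied via Spark SQL:

    import org.apache.spark.sql.SparkSession

    object ReadOrcTable {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .appName("read-orc-json-column")
          .enableHiveSupport()
          .getOrCreate()

        // ORC writer settings I tried, applied through Spark SQL
        spark.sql("SET orc.compress.size=4096")
        spark.sql("SET hive.exec.orc.default.stripe.size=268435456")

        // Scanning the table is what triggers the exception on the executors
        val df = spark.sql("SELECT * FROM db.events")
        df.show(10, truncate = false)
      }
    }

The task then fails with the exception below: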
org.apache.hive.com.google.protobuf.InvalidProtocolBufferException: Protocol message was too large. May be malicious. Use CodedInputStream.setSizeLimit() to increase the size limit.
at org.apache.hive.com.google.protobuf.InvalidProtocolBufferException.sizeLimitExceeded(InvalidProtocolBufferException.java:110)
at org.apache.hive.com.google.protobuf.CodedInputStream.refillBuffer(CodedInputStream.java:755)
at org.apache.hive.com.google.protobuf.CodedInputStream.isAtEnd(CodedInputStream.java:701)
at org.apache.hive.com.google.protobuf.CodedInputStream.readTag(CodedInputStream.java:99)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StringStatistics.<init>(OrcProto.java:1317)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StringStatistics.<init>(OrcProto.java:1281)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StringStatistics$1.parsePartialFrom(OrcProto.java:1374)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StringStatistics$1.parsePartialFrom(OrcProto.java:1369)
at org.apache.hive.com.google.protobuf.CodedInputStream.readMessage(CodedInputStream.java:309)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$ColumnStatistics.<init>(OrcProto.java:4897)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$ColumnStatistics.<init>(OrcProto.java:4813)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$ColumnStatistics$1.parsePartialFrom(OrcProto.java:5005)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$ColumnStatistics$1.parsePartialFrom(OrcProto.java:5000)
at org.apache.hive.com.google.protobuf.CodedInputStream.readMessage(CodedInputStream.java:309)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StripeStatistics.<init>(OrcProto.java:14334)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StripeStatistics.<init>(OrcProto.java:14281)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StripeStatistics$1.parsePartialFrom(OrcProto.java:14370)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$StripeStatistics$1.parsePartialFrom(OrcProto.java:14365)
at org.apache.hive.com.google.protobuf.CodedInputStream.readMessage(CodedInputStream.java:309)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$Metadata.<init>(OrcProto.java:15008)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$Metadata.<init>(OrcProto.java:14955)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$Metadata$1.parsePartialFrom(OrcProto.java:15044)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$Metadata$1.parsePartialFrom(OrcProto.java:15039)
at org.apache.hive.com.google.protobuf.AbstractParser.parsePartialFrom(AbstractParser.java:200)
at org.apache.hive.com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:217)
at org.apache.hive.com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:223)
at org.apache.hive.com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:49)
at org.apache.hadoop.hive.ql.io.orc.OrcProto$Metadata.parseFrom(OrcProto.java:15155)
at org.apache.hadoop.hive.ql.io.orc.ReaderImpl$MetaInfoObjExtractor.<init>(ReaderImpl.java:473)
at org.apache.hadoop.hive.ql.io.orc.ReaderImpl.<init>(ReaderImpl.java:319)
at org.apache.hadoop.hive.ql.io.orc.OrcFile.createReader(OrcFile.java:237)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getReader(OrcInputFormat.java:1261)
at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getRecordReader(OrcInputFormat.java:1170)
at org.apache.spark.rdd.HadoopRDD$$anon$1.liftedTree1$1(HadoopRDD.scala:257)
at org.apache.spark.rdd.HadoopRDD$$anon$1.<init>(HadoopRDD.scala:256)
at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:214)
at org.apache.spark.rdd.HadoopRDD.compute(HadoopRDD.scala:94)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.UnionRDD.compute(UnionRDD.scala:105)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Driver stacktrace: