为什么保存超过10000列的Parquet文件会导致JaninoRuntimeException?

时间:2017-04-05 15:41:12

标签: apache-spark apache-spark-sql parquet

我有这段代码,它在Spark 2.1中生成一个随机的DataFrame并将其写成Parquet文件。当列数达到100000时会遇到问题,但在列数为10000时似乎工作正常。

在100000列时,Spark只会在屏幕上打印出一大堆生成的代码,并抛出如下错误。

如何在不出错的情况下将其写入Parquet?

import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.util.Random
import scala.math

val nRows = 10000
val nCols = 100000

// Each row holds nCols random doubles in (0, 1000], rounded up to a whole number.
val rowRdd = sc.parallelize(0 until nRows).map { _ =>
  Row.fromSeq(Seq.fill(nCols)(math.ceil(Random.nextDouble() * 1000)))
}

// Schema: nCols nullable double columns named C0 .. C{nCols-1}.
val schema = StructType((0 until nCols).map(i => StructField(s"C$i", DoubleType, nullable = true)))
val df = spark.createDataFrame(rowRdd, schema)
df.select("*").write.format("parquet").save("df.parquet")

/* 379357 */   private void apply_22702(InternalRow i) {
/* 379358 */
/* 379359 */
/* 379360 */     boolean isNull90808 = i.isNullAt(90808);
/* 379361 */     double value90808 = isNull90808 ? -1.0 :  (i.getDouble(90808));
/* 379362 */     if (isNull90808) {
/* 379363 */       rowWriter.setNullAt(90808);
/* 379364 */     } else { 
/* 379365 */       rowWriter.write(90808, value90808);
/* 379366 */     }
/* 379367 */
/* 379368 */
/* 379369 */     boolean isNull90809 = i.isNullAt(90809);  
/* 379370 */     double value90809 = isNull90809 ? -1.0 : (i.getDouble(90809));
/* 379371 */     if (isNull90809) {
/* 379372 */       rowWriter.setNullAt(90809);
/* 379373 */     } else {
/* 379374 */       rowWriter.write(90809, value90809);
/* 379375 */     }
/* 379376 */
/* 379377 */
/* 379378 */     boolean isNull90810 = i.isNullAt(90810);
/* 379379 */     double value90810 = isNull90810 ? -1.0 : (i.getDouble(90810)); 
/* 379380 */     if (isNull90810) {
/* 379381 */       rowWriter.setNullAt(90810);
/* 379382 */     } else {
/* 379383 */       rowWriter.write(90810, value90810);
/* 379384 */     }
/* 379385 */
.
.
.
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:941)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:998)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:995)
    at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
    at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
    at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
    ... 25 more
Caused by: org.codehaus.janino.JaninoRuntimeException: Constant pool for class org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection has grown past JVM limit of 0xFFFF
    at org.codehaus.janino.util.ClassFile.addToConstantPool(ClassFile.java:499)
    at org.codehaus.janino.util.ClassFile.addConstantIntegerInfo(ClassFile.java:395)
    at org.codehaus.janino.UnitCompiler.addConstantIntegerInfo(UnitCompiler.java:11137)
    at org.codehaus.janino.UnitCompiler.pushConstant(UnitCompiler.java:9681)
    at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4911)
    at org.codehaus.janino.UnitCompiler.access$7700(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3776)
    at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3762)
    at org.codehaus.janino.Java$IntegerLiteral.accept(Java.java:4635)
    at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
    at org.codehaus.janino.UnitCompiler.fakeCompile(UnitCompiler.java:3128)
    at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4927)
    at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4526)
    at org.codehaus.janino.UnitCompiler.access$7500(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3774)
    at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3762)
    at org.codehaus.janino.Java$MethodInvocation.accept(Java.java:4328)
    at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
    at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4933)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2330)
    at org.codehaus.janino.UnitCompiler.access$2600(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1386)
    at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1370)
    at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:2974)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1370)
    at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1450)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:2811)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1262)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1234)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:538)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:890)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:894)
    at org.codehaus.janino.UnitCompiler.access$600(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:377)
    at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:369)
    at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1128)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
    at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1209)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:564)
    at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:420)
    at org.codehaus.janino.UnitCompiler.access$400(UnitCompiler.java:206)
    at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:374)
    at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:369)
    at org.codehaus.janino.Java$AbstractPackageMemberClassDeclaration.accept(Java.java:1309)
    at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
    at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:345)
    at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:396)
    at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:311)
    at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:229)
    at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:196)
    at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:91)
    at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:935)
    ... 30 more

1 个答案:

答案 0 :(得分:2)

这看起来像是codegen生成的代码超出JVM限制(常量池条目数超过0xFFFF)的那类令人讨厌的问题之一,如 SPARK-18492 中所述。

您可能需要试用每晚构建的 2.2.0-SNAPSHOT 版本之一,看看 SPARK-16845 的修复是否能解决您的问题(待该版本正式发布后即可使用)。