我用下面这段代码生成一个随机的 DataFrame,并在 Spark 2.1 中将其写为 Parquet 文件。当列数达到 100000 时会遇到问题,但在 10000 列时似乎工作正常。
在 100000 列的情况下,Spark 只会在屏幕上打印出一大堆生成的代码,然后抛出如下所示的错误。
如何才能在不出错的情况下将其写入 Parquet?
import org.apache.spark.sql.types.{StructType,StructField,IntegerType,DoubleType}
import org.apache.spark.ml.Pipeline
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import scala.util.Random
import scala.math
// Dimensions of the synthetic dataset.
val nRows = 10000
val nCols = 100000

// Build one Row per index; each cell is ceil(1000 * r) for a fresh random r in [0, 1).
val rD = sc.parallelize(0 until nRows).map { _ =>
  val cells = Seq.fill(nCols)(math.ceil(1000 * Random.nextDouble()))
  Row.fromSeq(cells)
}

// Nullable Double columns named C0, C1, ..., C(nCols - 1).
val schema = StructType(
  (0 until nCols).map(i => StructField(s"C$i", DoubleType, nullable = true))
)

val df = spark.createDataFrame(rD, schema)

// NOTE(review): with nCols = 100000 on Spark 2.1 this script is reported to fail with
// "Constant pool for class ...GeneratedClass$SpecificUnsafeProjection has grown past
// JVM limit of 0xFFFF"; 10000 columns reportedly works.
df.select("*").write.parquet("df.parquet")
/* 379357 */ private void apply_22702(InternalRow i) {
/* 379358 */
/* 379359 */
/* 379360 */ boolean isNull90808 = i.isNullAt(90808);
/* 379361 */ double value90808 = isNull90808 ? -1.0 : (i.getDouble(90808));
/* 379362 */ if (isNull90808) {
/* 379363 */ rowWriter.setNullAt(90808);
/* 379364 */ } else {
/* 379365 */ rowWriter.write(90808, value90808);
/* 379366 */ }
/* 379367 */
/* 379368 */
/* 379369 */ boolean isNull90809 = i.isNullAt(90809);
/* 379370 */ double value90809 = isNull90809 ? -1.0 : (i.getDouble(90809));
/* 379371 */ if (isNull90809) {
/* 379372 */ rowWriter.setNullAt(90809);
/* 379373 */ } else {
/* 379374 */ rowWriter.write(90809, value90809);
/* 379375 */ }
/* 379376 */
/* 379377 */
/* 379378 */ boolean isNull90810 = i.isNullAt(90810);
/* 379379 */ double value90810 = isNull90810 ? -1.0 : (i.getDouble(90810));
/* 379380 */ if (isNull90810) {
/* 379381 */ rowWriter.setNullAt(90810);
/* 379382 */ } else {
/* 379383 */ rowWriter.write(90810, value90810);
/* 379384 */ }
/* 379385 */
.
.
.
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:941)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:998)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:995)
at org.spark_project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
at org.spark_project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
at org.spark_project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
... 25 more
Caused by: org.codehaus.janino.JaninoRuntimeException: Constant pool for class org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection has grown past JVM limit of 0xFFFF
at org.codehaus.janino.util.ClassFile.addToConstantPool(ClassFile.java:499)
at org.codehaus.janino.util.ClassFile.addConstantIntegerInfo(ClassFile.java:395)
at org.codehaus.janino.UnitCompiler.addConstantIntegerInfo(UnitCompiler.java:11137)
at org.codehaus.janino.UnitCompiler.pushConstant(UnitCompiler.java:9681)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4911)
at org.codehaus.janino.UnitCompiler.access$7700(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3776)
at org.codehaus.janino.UnitCompiler$12.visitIntegerLiteral(UnitCompiler.java:3762)
at org.codehaus.janino.Java$IntegerLiteral.accept(Java.java:4635)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.fakeCompile(UnitCompiler.java:3128)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4927)
at org.codehaus.janino.UnitCompiler.compileGet2(UnitCompiler.java:4526)
at org.codehaus.janino.UnitCompiler.access$7500(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3774)
at org.codehaus.janino.UnitCompiler$12.visitMethodInvocation(UnitCompiler.java:3762)
at org.codehaus.janino.Java$MethodInvocation.accept(Java.java:4328)
at org.codehaus.janino.UnitCompiler.compileGet(UnitCompiler.java:3762)
at org.codehaus.janino.UnitCompiler.compileGetValue(UnitCompiler.java:4933)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:2330)
at org.codehaus.janino.UnitCompiler.access$2600(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1386)
at org.codehaus.janino.UnitCompiler$6.visitLocalVariableDeclarationStatement(UnitCompiler.java:1370)
at org.codehaus.janino.Java$LocalVariableDeclarationStatement.accept(Java.java:2974)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:1370)
at org.codehaus.janino.UnitCompiler.compileStatements(UnitCompiler.java:1450)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:2811)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1262)
at org.codehaus.janino.UnitCompiler.compileDeclaredMethods(UnitCompiler.java:1234)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:538)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:890)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:894)
at org.codehaus.janino.UnitCompiler.access$600(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:377)
at org.codehaus.janino.UnitCompiler$2.visitMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$MemberClassDeclaration.accept(Java.java:1128)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileDeclaredMemberTypes(UnitCompiler.java:1209)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:564)
at org.codehaus.janino.UnitCompiler.compile2(UnitCompiler.java:420)
at org.codehaus.janino.UnitCompiler.access$400(UnitCompiler.java:206)
at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:374)
at org.codehaus.janino.UnitCompiler$2.visitPackageMemberClassDeclaration(UnitCompiler.java:369)
at org.codehaus.janino.Java$AbstractPackageMemberClassDeclaration.accept(Java.java:1309)
at org.codehaus.janino.UnitCompiler.compile(UnitCompiler.java:369)
at org.codehaus.janino.UnitCompiler.compileUnit(UnitCompiler.java:345)
at org.codehaus.janino.SimpleCompiler.compileToClassLoader(SimpleCompiler.java:396)
at org.codehaus.janino.ClassBodyEvaluator.compileToClass(ClassBodyEvaluator.java:311)
at org.codehaus.janino.ClassBodyEvaluator.cook(ClassBodyEvaluator.java:229)
at org.codehaus.janino.SimpleCompiler.cook(SimpleCompiler.java:196)
at org.codehaus.commons.compiler.Cookable.cook(Cookable.java:91)
at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$.org$apache$spark$sql$catalyst$expressions$codegen$CodeGenerator$$doCompile(CodeGenerator.scala:935)
... 30 more
答案 0 :(得分:2)
看起来像是 codegen 超出 64KB 方法大小限制的那类令人讨厌的问题之一(如 SPARK-16845 和 SPARK-18492 中所述)。
您可以试用包含 SPARK-16845 修复的 2.2.0-SNAPSHOT 每夜构建版本之一,看看该问题在未来的正式版本(发布之后)中是否能得到解决。