I wrote a function that joins a list of DataFrames on a common column. Here is the code:
def joinByColumn(dfs: List[DataFrame], column: String): DataFrame = {
  // check that all dfs contain the required column
  require(dfs.map(_.columns).forall(_.contains(column)))
  dfs.reduce((df1, df2) => df1.join(df2, Seq(column), "full_outer"))
}
I wrote a test case for this function. It passes for small values of columnNum (say 4), but with a larger value such as 200 it throws a stack overflow error.
test("complicated") {
val base = sqlContext.createDataFrame(
Seq(
(1, 1)
)
).toDF("key", "a")
val columnNum = 200
val dfs = (1 to columnNum)
.map(i => base.toDF("key", s"a$i"))
.toList
val actual = Ex4.joinByColumn(dfs, "key")
actual.explain()
val row = Row.fromSeq(Seq.fill(columnNum + 1)(1))
val rdd = sc.parallelize(Seq(row))
val columns = "key" :: (1 to columnNum).map(i => s"a$i").toList
val schema = StructType(columns.map(c => StructField(c, IntegerType)))
val expected = sqlContext.createDataFrame(rdd, schema)
expected should beEqualTo(actual)
}
Please find below the stack trace:
java.lang.StackOverflowError was thrown.
java.lang.StackOverflowError
at com.fasterxml.jackson.databind.deser.impl.PropertyValueBuffer._findMissing(PropertyValueBuffer.java:134)
at com.fasterxml.jackson.databind.deser.impl.PropertyValueBuffer.getParameters(PropertyValueBuffer.java:118)
at com.fasterxml.jackson.databind.deser.impl.PropertyBasedCreator.build(PropertyBasedCreator.java:136)
at com.fasterxml.jackson.databind.deser.BeanDeserializer._deserializeUsingPropertyBased(BeanDeserializer.java:442)
at com.fasterxml.jackson.databind.deser.BeanDeserializerBase.deserializeFromObjectUsingNonDefault(BeanDeserializerBase.java:1099)
at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserializeFromObject(BeanDeserializer.java:296)
at com.fasterxml.jackson.databind.deser.BeanDeserializer.deserialize(BeanDeserializer.java:133)
at com.fasterxml.jackson.databind.ObjectMapper._readMapAndClose(ObjectMapper.java:3736)
at com.fasterxml.jackson.databind.ObjectMapper.readValue(ObjectMapper.java:2726)
at org.apache.spark.rdd.RDDOperationScope$.fromJson(RDDOperationScope.scala:86)
at org.apache.spark.rdd.RDDOperationScope$$anonfun$5.apply(RDDOperationScope.scala:137)
at org.apache.spark.rdd.RDDOperationScope$$anonfun$5.apply(RDDOperationScope.scala:137)
at scala.Option.map(Option.scala:146)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:137)
at org.apache.spark.rdd.RDD.doCheckpoint(RDD.scala:1755)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1$$anonfun$apply$mcV$sp$2.apply(RDD.scala:1768)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1$$anonfun$apply$mcV$sp$2.apply(RDD.scala:1768)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply$mcV$sp(RDD.scala:1768)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply(RDD.scala:1756)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply(RDD.scala:1756)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDD.doCheckpoint(RDD.scala:1755)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1$$anonfun$apply$mcV$sp$2.apply(RDD.scala:1768)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1$$anonfun$apply$mcV$sp$2.apply(RDD.scala:1768)
at scala.collection.immutable.List.foreach(List.scala:392)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply$mcV$sp(RDD.scala:1768)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply(RDD.scala:1756)
at org.apache.spark.rdd.RDD$$anonfun$doCheckpoint$1.apply(RDD.scala:1756)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
... (the same doCheckpoint / withScope / List.foreach frames repeat many more times until the stack overflows) ...
Can someone help me find the root cause? How can we fix this in this example? Is there a better way to join a huge list of DataFrames?
Answer 0 (score: 0)
I was able to work around this with local checkpointing: the reduce chains 200 joins into one deeply nested plan, and calling localCheckpoint after each join truncates the lineage so the plan (and the recursive traversal behind the stack overflow) stays shallow:
def joinByColumn(dfs: List[DataFrame], column: String): DataFrame = {
  // check that all dfs contain the required column
  require(dfs.map(_.columns).forall(_.contains(column)))
  dfs.reduce((df1, df2) =>
    df1.join(df2, Seq(column), "full_outer").localCheckpoint(true)
  )
}
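Eagerly checkpointing after every single join also materializes every intermediate result, which can be slow. A possible variant, shown below as a sketch under the same setup (the joinByColumnBatched name and the batchSize parameter are mine, not part of the original answer), truncates the lineage only every few joins:

import org.apache.spark.sql.DataFrame

def joinByColumnBatched(dfs: List[DataFrame], column: String, batchSize: Int = 10): DataFrame = {
  // Hypothetical variant, not the original answer: cut the lineage only every
  // `batchSize` joins so the plan stays shallow without checkpointing each step.
  require(dfs.map(_.columns).forall(_.contains(column)))
  dfs.zipWithIndex.reduce { case ((df1, _), (df2, i)) =>
    val joined = df1.join(df2, Seq(column), "full_outer")
    // localCheckpoint(true) eagerly materializes the result and drops the lineage.
    val next = if (i % batchSize == 0) joined.localCheckpoint(true) else joined
    (next, i)
  }._1
}

Note that localCheckpoint stores the data on the executors' local storage, so it is lost if an executor dies; if that matters, setting a checkpoint directory with sc.setCheckpointDir and using df.checkpoint(true) is the reliable (but slower) alternative.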