我想使用Dataframe / Dataset的except / intersection方法,但不幸的是我发现如果数据集或数据框包含规范的case类,操作将失败。我还注意到这两种方法都包含:
@note直接在编码上执行等式检查 数据*的表示,因此不受自定义的影响 在
上定义的equals
T
函数
任何建议什么是最好的方法?
示例代码:
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.immutable.Seq
case class TestString(str : String)
case class CanonicalExample(tString : TestString)
object GroupOperations {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder.appName("GroupOperations").master("local[2]").enableHiveSupport.getOrCreate
import spark.implicits._
val list1 = List[TestString](new TestString("a"), new TestString("b"), new TestString("c"))
val list2 = List[TestString](new TestString("d"), new TestString("e"), new TestString("c"))
val df1: DataFrame = spark.sqlContext.createDataFrame[TestString](list1)
val df2: DataFrame = spark.sqlContext.createDataFrame[TestString](list2)
//This is working !!!
df1.except(df2).show()
val list3: Seq[CanonicalExample] = list1.map(t => new CanonicalExample(t))
val list4: Seq[CanonicalExample] = list2.map(t => new CanonicalExample(t))
val df3: DataFrame = spark.sqlContext.createDataFrame[CanonicalExample](list3)
val df4: DataFrame = spark.sqlContext.createDataFrame[CanonicalExample](list4)
//This Will fail !!
df3.intersect(df4).show()
}
}
例外:
16:55:24 [main] [CodeGenerator.logError]: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 253, Column 34: Expression "inputadapter_isNull" is not an rvalue
/* 001 */ public Object generate(Object[] references) {
/* 002 */ return new GeneratedIterator(references);
/* 003 */ }
/* 004 */
/* 005 */ final class GeneratedIterator extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 006 */ private Object[] references;
/* 007 */ private scala.collection.Iterator[] inputs;
/* 008 */ private boolean agg_initAgg;
/* 009 */ private org.apache.spark.sql.execution.aggregate.HashAggregateExec agg_plan;
/* 010 */ private org.apache.spark.sql.execution.UnsafeFixedWidthAggregationMap agg_hashMap;
/* 011 */ private org.apache.spark.sql.execution.UnsafeKVExternalSorter agg_sorter;
/* 012 */ private org.apache.spark.unsafe.KVIterator agg_mapIter;
/* 013 */ private org.apache.spark.sql.execution.metric.SQLMetric agg_peakMemory;
/* 014 */ private org.apache.spark.sql.execution.metric.SQLMetric agg_spillSize;