I have the following situation:
I have two dataframes, each containing only one column. Let's say
DF1 = (1, 2, 3, 4, 5)
DF2 = (3, 6, 7, 8, 9, 10)
These values are essentially keys, and I create a Parquet file of DF1 only if the keys in DF1 are all present in DF2 (in the current example, the check should return false). My current way of implementing this requirement is:
val df1count = DF1.count
val df2count = DF2.count
val diffDF = DF2.except(DF1)
val diffCount = diffDF.count

if (diffCount == (df2count - df1count)) true
else false
The problem with this approach is that I am calling actions four times, which is surely not the best way to do it. Can someone suggest the most efficient way to achieve this?
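As a point of comparison, the whole check can usually be collapsed into a single action with a left anti join: every key of DF1 is present in DF2 exactly when the anti join result is empty. A minimal sketch, assuming both dataframes share a column name (here "key") and Spark 2.4+ for Dataset.isEmpty (on older versions, head(1).isEmpty works the same way):

import org.apache.spark.sql.DataFrame

// True when every key in df1 also appears in df2: a left anti join keeps
// exactly the rows of df1 that have no match in df2.
def allKeysPresent(df1: DataFrame, df2: DataFrame, key: String): Boolean =
  df1.join(df2, Seq(key), "left_anti").isEmpty // one action (Spark 2.4+)

// Usage sketch: if (allKeysPresent(DF1, DF2, "key")) DF1.write.parquet("/some/path")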
Answer 0 (score: 1)
You can use the following function:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._

def diff(key: String, df1: DataFrame, df2: DataFrame): DataFrame = {
  val fields = df1.schema.fields.map(_.name)
  val diffColumnName = "Diff"

  df1
    // full outer join, so rows present in only one side are kept
    .join(df2, df1(key) === df2(key), "full_outer")
    .withColumn(
      diffColumnName,
      when(df1(key).isNull, "New row in DataFrame 2")
        .otherwise(
          when(df2(key).isNull, "New row in DataFrame 1")
            .otherwise(
              // for matched keys, list the names of the columns that differ
              concat_ws("",
                fields.map(f => when(df1(f) =!= df2(f), s"$f ").otherwise("")): _*
              )
            )
        )
    )
    // drop rows that are identical on both sides
    .filter(col(diffColumnName) =!= "")
    .select(
      fields.map(f =>
        when(df1(key).isNotNull, df1(f)).otherwise(df2(f)).alias(f)
      ) :+ col(diffColumnName): _*
    )
}
In your case, run:
diff("emp_id", df1, df2)
Example:
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._

object DiffDataFrames extends App {
  val session = SparkSession.builder().master("local").getOrCreate()
  import session.implicits._

  val df1 = session.createDataset(Seq((1, "a", 11), (2, "b", 2), (3, "c", 33), (5, "e", 5))).toDF("n", "s", "i")
  val df2 = session.createDataset(Seq((1, "a", 11), (2, "bb", 2), (3, "cc", 34), (4, "d", 4))).toDF("n", "s", "i")

  def diff(key: String, df1: DataFrame, df2: DataFrame): DataFrame =
    /* above definition */

  diff("n", df1, df2).show(false)
}
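For reference, running the example above should print something like the following (row order may vary, since the full outer join gives no ordering guarantee):

+---+---+---+----------------------+
|n  |s  |i  |Diff                  |
+---+---+---+----------------------+
|2  |b  |2  |s                     |
|3  |c  |33 |s i                   |
|5  |e  |5  |New row in DataFrame 1|
|4  |d  |4  |New row in DataFrame 2|
+---+---+---+----------------------+

The row with n = 1 is filtered out because it is identical in both dataframes.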
Answer 1 (score: 1)
Here is a way to get the rows that are not common between two dataframes:
val d1 = Seq(
  (3, "Chennai", "rahman", "9848022330", 45000, "SanRamon"),
  (1, "Hyderabad", "ram", "9848022338", 50000, "SF"),
  (2, "Hyderabad", "robin", "9848022339", 40000, "LA"),
  (4, "sanjose", "romin", "9848022331", 45123, "SanRamon"))

val d2 = Seq(
  (3, "Chennai", "rahman", "9848022330", 45000, "SanRamon"),
  (1, "Hyderabad", "ram", "9848022338", 50000, "SF"),
  (2, "Hyderabad", "robin", "9848022339", 40000, "LA"),
  (4, "sanjose", "romin", "9848022331", 45123, "SanRamon"),
  (4, "sanjose", "romino", "9848022331", 45123, "SanRamon"),
  (5, "LA", "Test", "1234567890", 12345, "Testuser"))

val df1 = d1.toDF("emp_id", "emp_city", "emp_name", "emp_phone", "emp_sal", "emp_site")
val df2 = d2.toDF("emp_id", "emp_city", "emp_name", "emp_phone", "emp_sal", "emp_site")

// Register temp views so the dataframes can be referenced by name in SQL
df1.createOrReplaceTempView("df1")
df2.createOrReplaceTempView("df2")

// Symmetric difference: (df1 UNION df2) MINUS (df1 INTERSECT df2)
spark.sql("((select * from df1) union (select * from df2)) minus ((select * from df1) intersect (select * from df2))").show() // spark is the SparkSession
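The same result can be obtained without registering temp views by staying in the DataFrame API; a sketch (note that DataFrame union is UNION ALL rather than SQL's distinct UNION, but the trailing except removes duplicates, so the result matches):

// Rows present in exactly one of the two dataframes (symmetric difference)
val uncommon = df1.union(df2).except(df1.intersect(df2))
uncommon.show()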