我希望能够将两个数据帧的连接条件作为输入字符串传递。我们的想法是使连接足够通用,以便用户可以传递他们喜欢的条件。
这是我目前的做法。虽然它可以正常工作,但我觉得不够简洁。
// Every "left=right" entry is turned into an equality predicate by testMethod;
// the predicates are then AND-ed together into a single join condition.
val testInput = Array("a=b", "c=d")
val condition: Column =
  testInput
    .map(testMethod)
    .reduce(_ and _)
firstDataFrame.join(secondDataFrame, condition, "fullouter")
这是testMethod
/**
 * Builds an equality join predicate from a string of the form `"left=right"`.
 *
 * Whitespace around either column name is ignored, so `"a = b"` and `"a=b"`
 * produce the same predicate.
 *
 * @param inputString condition text: two column names separated by a single `=`
 * @return a `Column` expressing `col(left) === col(right)`
 * @throws IllegalArgumentException if the input is not exactly two non-empty
 *                                  names separated by one `=`
 */
def testMethod(inputString: String): Column = {
  // A negative limit keeps trailing empty tokens, so "a=" is seen as
  // Array("a", "") and rejected instead of being truncated to Array("a").
  inputString.split("=", -1).map(_.trim) match {
    case Array(left, right) if left.nonEmpty && right.nonEmpty =>
      col(left) === col(right)
    case _ =>
      throw new IllegalArgumentException(
        s"Invalid join condition '$inputString': expected '<leftColumn>=<rightColumn>'")
  }
}
我想找到一种更好的方式来接收输入,并据此动态生成连接条件,希望能得到一些帮助。
答案 0 :(得分:2)
不确定这样的自定义方法能带来多少好处,但如果你一定要这样做,我建议让它同时支持以下两种 join 情形:同名列之间的连接,以及不等式条件(例如 <、<=、>=、>、!=)。
示例代码如下:
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
/**
 * Joins two DataFrames on conditions given as strings of the form
 * `"<leftCol> <op> <rightCol>"`, with the three parts separated by whitespace.
 *
 * The left operand is looked up on `dfL` and the right operand on `dfR`, so
 * both sides may use the same column name (e.g. `"id = id"`). Supported
 * operators are `<`, `<=`, `=`, `>=`, `>` and `!=`. All conditions are combined
 * with logical AND.
 *
 * @param dfL        left DataFrame
 * @param dfR        right DataFrame
 * @param conditions non-empty list of condition strings
 * @param joinType   join type, passed unchanged to `DataFrame.join`
 * @return the result of `dfL.join(dfR, <combined condition>, joinType)`
 * @throws IllegalArgumentException if `conditions` is empty, or if a condition
 *                                  does not have exactly three parts or uses an
 *                                  unsupported operator
 */
def joinDFs(
    dfL: DataFrame,
    dfR: DataFrame,
    conditions: List[String],
    joinType: String
): DataFrame = {
  val predicates = conditions.map { cond =>
    // Trim first: leading whitespace would otherwise make split produce an
    // empty first token and cause a well-formed condition to be rejected.
    cond.trim.split("\\s+") match {
      case Array(left, op, right) =>
        op match {
          case "<"  => dfL(left) < dfR(right)
          case "<=" => dfL(left) <= dfR(right)
          case "="  => dfL(left) === dfR(right)
          case ">=" => dfL(left) >= dfR(right)
          case ">"  => dfL(left) > dfR(right)
          case "!=" => dfL(left) =!= dfR(right)
          case _ =>
            throw new IllegalArgumentException(
              s"Invalid join conditions! Unsupported operator '$op' in '$cond'")
        }
      case _ =>
        throw new IllegalArgumentException(
          s"Invalid join conditions! Expected '<leftCol> <op> <rightCol>' but got '$cond'")
    }
  }

  // reduce on an empty list fails with an opaque UnsupportedOperationException;
  // report the actual problem instead.
  val joinCondition = predicates.reduceOption(_ and _).getOrElse(
    throw new IllegalArgumentException(
      "Invalid join conditions! At least one condition is required"))

  dfL.join(dfR, joinCondition, joinType)
}
// Sample data: both sides use the same column names ("id", "date", "value").
val dfLeft =
  Seq((1, "2018-04-01", "p"), (1, "2018-04-01", "q"), (2, "2018-05-01", "r"))
    .toDF("id", "date", "value")

val dfRight =
  Seq((1, "2018-04-15", "x"), (2, "2018-04-15", "y"))
    .toDF("id", "date", "value")

// Equality on "id" combined with an inequality on "date".
val conditions = "id = id" :: "date <= date" :: Nil

joinDFs(dfLeft, dfRight, conditions, "left_outer").show()
// +---+----------+-----+----+----------+-----+
// | id| date|value| id| date|value|
// +---+----------+-----+----+----------+-----+
// | 1|2018-04-01| p| 1|2018-04-15| x|
// | 1|2018-04-01| q| 1|2018-04-15| x|
// | 2|2018-05-01| r|null| null| null|
// +---+----------+-----+----+----------+-----+