我正在尝试使用Apache Spark使用唯一列“ ID”将记录更新并插入到旧的Dataframe中。
答案 0 :(得分:0)
为了更新数据框,您可以对唯一列执行“ left_anti”联接,然后与具有新记录的数据框进行UNION
def refreshUnion(oldDS: Dataset[_], newDS: Dataset[_], usingColumns: Seq[String]): Dataset[_] = {
val filteredNewDS = selectAndCastColumns(newDS, oldDS)
oldDS.join(
filteredNewDS,
usingColumns,
"left_anti")
.select(oldDS.columns.map(columnName => col(columnName)): _*)
.union(filteredNewDS.toDF)
}
def selectAndCastColumns(ds: Dataset[_], refDS: Dataset[_]): Dataset[_] = {
val columns = ds.columns.toSet
ds.select(refDS.columns.map(c => {
if (!columns.contains(c)) {
lit(null).cast(refDS.schema(c).dataType) as c
} else {
ds(c).cast(refDS.schema(c).dataType) as c
}
}): _*)
}
val df = refreshUnion(oldDS, newDS, Seq("ID"))
答案 1 :(得分:0)
Spark数据帧是不可变的结构。因此,您不能根据ID进行任何更新。
更新数据框的方法是合并较旧的数据框和较新的数据框,然后将合并的数据框保存在HDFS上。要更新较旧的ID,您将需要一些重复数据删除密钥(可能是时间戳)。
我正在Scala中为此添加示例代码。您需要使用uniqueId和时间戳列名称来调用merge
函数。时间戳记应为长。
case class DedupableDF(unique_id: String, ts: Long);
def merge(snapshot: DataFrame)(
delta: DataFrame)(uniqueId: String, timeStampStr: String): DataFrame = {
val mergedDf = snapshot.union(delta)
return dedupeData(mergedDf)(uniqueId, timeStampStr)
}
def dedupeData(dataFrameToDedupe: DataFrame)(
uniqueId: String,
timeStampStr: String): DataFrame = {
import sqlContext.implicits._
def removeDuplicates(
duplicatedDataFrame: DataFrame): Dataset[DedupableDF] = {
val dedupableDF = duplicatedDataFrame.map(a =>
DedupableDF(a(0).asInstanceOf[String], a(1).asInstanceOf[Long]))
val mappedPairRdd =
dedupableDF.map(row ⇒ (row.unique_id, (row.unique_id, row.ts))).rdd;
val reduceByKeyRDD = mappedPairRdd
.reduceByKey((row1, row2) ⇒ {
if (row1._2 > row2._2) {
row1
} else {
row2
}
})
.values;
val ds = reduceByKeyRDD.toDF.map(a =>
DedupableDF(a(0).asInstanceOf[String], a(1).asInstanceOf[Long]))
return ds;
}
/** get distinct unique_id, timestamp combinations **/
val filteredData =
dataFrameToDedupe.select(uniqueId, timeStampStr).distinct
val dedupedData = removeDuplicates(filteredData)
dataFrameToDedupe.createOrReplaceTempView("duplicatedDataFrame");
dedupedData.createOrReplaceTempView("dedupedDataFrame");
val dedupedDataFrame =
sqlContext.sql(s""" select distinct duplicatedDataFrame.*
from duplicatedDataFrame
join dedupedDataFrame on
(duplicatedDataFrame.${uniqueId} = dedupedDataFrame.unique_id
and duplicatedDataFrame.${timeStampStr} = dedupedDataFrame.ts)""")
return dedupedDataFrame
}