在尝试对数据框的列进行转换(`HashingTF` 和 `IDFModel` 转换)时,我希望保留该列上 `null` 值的信息。
因此,在转换之前不应该使用 `df.na.drop(column_name)`。
不出所料,执行以下代码会导致 `Caused by: java.lang.NullPointerException`:
import org.apache.spark.ml.feature.{HashingTF, IDFModel}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, lit}
/**
 * Replaces `column` with the result of running it through `tf` and then `idf`.
 *
 * The column is temporarily renamed to "value" before the transformers run.
 * Afterwards "idfFeature" is renamed back to `column` and the helper columns
 * "tfFeature" and "value" are dropped.
 *
 * NOTE(review): presumably `tf` is configured with inputCol "value" / outputCol
 * "tfFeature" and `idf` with inputCol "tfFeature" / outputCol "idfFeature" —
 * confirm against the caller.
 *
 * This variant does no null filtering: rows whose `column` is null are handed
 * to the transformers as-is.
 *
 * @param df     input data frame containing `column`
 * @param tf     term-frequency transformer applied first
 * @param idf    fitted IDF model applied to the output of `tf`
 * @param column name of the column to replace with its TF-IDF representation
 * @return `df` with `column` replaced by the "idfFeature" output of `idf`
 */
def seq2tfidf(df: DataFrame, tf: HashingTF, idf: IDFModel, column: String): DataFrame = {
  val renamedDf = df.withColumnRenamed(column, "value")
  // Feed the renamed frame itself to the transformer; the original referenced
  // `withValueDf`, which is not defined in this function.
  val tfValues = tf.transform(renamedDf)
  idf.transform(tfValues)
    .withColumnRenamed("idfFeature", column)
    .drop("tfFeature")
    .drop("value")
}
可行的方法是将值为 null 和非 null 的行拆分为两个数据帧,最后再做并集(union),如下所示:
/**
 * Replaces `column` with the result of running it through `tf` and then `idf`,
 * keeping rows whose `column` is null (the result column stays null for them).
 *
 * The frame is split into rows with and without a value in `column`. Only the
 * non-null rows are passed through the transformers; the null rows get a typed
 * null vector column instead, and the two halves are unioned.
 *
 * NOTE(review): presumably `tf` is configured with inputCol "value" / outputCol
 * "tfFeature" and `idf` with inputCol "tfFeature" / outputCol "idfFeature" —
 * confirm against the caller.
 *
 * @param df     input data frame containing `column`
 * @param tf     term-frequency transformer applied first
 * @param idf    fitted IDF model applied to the output of `tf`
 * @param column name of the column to replace with its TF-IDF representation
 * @return the transformed non-null rows followed by a union with the null rows;
 *         in both halves `column` is the last column
 */
def seq2tfidf(df: DataFrame, tf: HashingTF, idf: IDFModel, column: String): DataFrame = {
  // Local import keeps this block self-contained: the null column must carry the
  // ML vector type so both sides of the union expose the same schema.
  import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

  val renamedDf = df.withColumnRenamed(column, "value")
  val withValueDf = renamedDf.filter(col("value").isNotNull)

  // Null rows skip the transformers. Drop the helper column first, then append
  // a typed null under the final column name, so this half has the same column
  // names and order as the transformed half (`union` resolves by position).
  val withNullValueDf = renamedDf
    .filter(col("value").isNull)
    .drop("value")
    .withColumn(column, lit(null).cast(VectorType))

  val idfValues = idf.transform(tf.transform(withValueDf))
    .withColumnRenamed("idfFeature", column)
    .drop("tfFeature")
    .drop("value")

  idfValues.union(withNullValueDf)
}
工作正常,但似乎有点慢。还有另一种方法吗?