我正在尝试扩展数据帧的每一列。
首先,我将每列转换为矢量,然后使用 Spark ML 的 MinMaxScaler。
除了简单地重复之外,是否有更好/更优雅的方法将相同的功能应用于每一列?
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.functions.udf
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.sql.DataFrame
val toVector = udf((vct:Double) => Vectors.dense(Array(vct)) )
// Sample data; toDF requires spark.implicits._ in scope — presumably imported
// earlier in the session (TODO confirm).
val df = Seq((1, 5, 3), (4, 2, 9), (7, 8, 6)).toDF("A", "B", "C")
// Add a Vector-typed companion column per numeric column.
// Fix: the original called `vectorizeDf`, which is not defined anywhere in
// this snippet — the UDF defined above is named `toVector`.
val dfVec = df.withColumn("AVec", toVector('A))
  .withColumn("BVec", toVector('B))
  .withColumn("CVec", toVector('C))
/** Min-max scale `inputCol` (a Vector column) into `inputCol + "Scaled"`,
  * mapping values into [0, 1]. Fits on `df` and returns the transformed frame.
  */
def scaler(df: DataFrame, inputCol: String) = {
  val minMax = new MinMaxScaler()
    .setMin(0)
    .setMax(1)
    .setInputCol(inputCol)
    .setOutputCol(inputCol + "Scaled")
  minMax.fit(df).transform(df)
}
scaler(scaler(scaler(dfVec,"AVec"),"BVec"),"CVec")
+---+---+---+-----+-----+-----+----------+----------+----------+
| A| B| C| AVec| BVec| CVec|AVecScaled|BVecScaled|CVecScaled|
+---+---+---+-----+-----+-----+----------+----------+----------+
| 1| 5| 3|[1.0]|[5.0]|[3.0]| [0.0]| [0.5]| [0.0]|
| 4| 2| 9|[4.0]|[2.0]|[9.0]| [0.5]| [0.0]| [1.0]|
| 7| 8| 6|[7.0]|[8.0]|[6.0]| [1.0]| [1.0]| [0.5]|
+---+---+---+-----+-----+-----+----------+----------+----------+