我有一个 Parquet 文件,其中包含两列(id, features)。我想先从 features 列中减去一个标量,然后将结果除以另一个标量。
// Attempt from the question: subtract constant1 from the "features" column, then divide the result by constant2.
df.withColumn("features", ((df("features")-constant1)/constant2))
但是给我错误
requirement failed: 列数不匹配。旧列名 (2): id, features;新列名 (1): features。如何解决?
答案 0 :(得分:0)
我的 Scala Spark 代码如下。对 Spark ML 的 Vector 数据类型执行运算的唯一方法是先将其强制转换为字符串。另外使用 UDF 来完成减法和除法。
// Sample records used to demonstrate plucking one property from every element.
let data = [
  {
    age: 25,
    name: 'Michael'
  },
  {
    age: 20,
    name: 'David'
  }
]

// `prop` is a function declaration, so it is hoisted and may be called before
// its definition below. It returns the callback that `map` applies to each item.
let name = data.map(prop('name')).join(',')
let age = data.map(prop('age')).join(',')

/**
 * Builds an accessor for a single property.
 *
 * @param {string} string - Name of the property to read.
 * @returns {function(Object): *} Function returning `item[string]` for the item it is given.
 */
function prop(string) {
  return function (item) {
    return item[string]
  }
}

console.log(name) // Michael,David
console.log(age) // 25,20
结果
import spark.implicits._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions._
// Local import: the UDF below works on the ML Vector type directly, so the
// column no longer has to be round-tripped through a string.
import org.apache.spark.ml.linalg.Vector

// Sample input: an id column plus a dense vector column named "features".
val df = Seq(
  (1, Vectors.dense(35)),
  (2, Vectors.dense(45)),
  (3, Vectors.dense(4.5073)),
  (4, Vectors.dense(56))
).toDF("id", "features")
df.show()

val constant1 = 10
val constant2 = 2
// Fail fast rather than silently writing Infinity/NaN into the output.
require(constant2 != 0, "constant2 must be non-zero")

// Applies (x - val1) / val2 to every element of the vector. Operating on the
// Vector itself (instead of casting it to a string and stripping the brackets)
// keeps multi-element and sparse vectors intact; the result is always dense.
// A null vector is passed through as null (Spark represents missing values as null).
val performComputation = (v: Vector, val1: Int, val2: Int) =>
  Option(v)
    .map(vec => Vectors.dense(vec.toArray.map(x => (x - val1) / val2)))
    .orNull
val performComputationUDF = udf(performComputation)

df.printSchema()

// Replace "features" with its rescaled version; "id" is carried over unchanged.
val scaledDf = df.withColumn(
  "features",
  performComputationUDF(df.col("features"), lit(constant1), lit(constant2))
)
scaledDf.show(20, false)

// Write the result as Parquet, overwriting any previous output at this path.
scaledDf.write
  .mode("overwrite")
  .parquet("file:///usr/local/spark/dataset/output1/")