How can I map DataFrame columns to get the results in parallel?

Asked: 2018-05-21 15:34:47

Tags: scala apache-spark-sql

I have a DataFrame with many columns, and for each column I want the max, min, std, median, mean, and non-null count. I compute the results with a loop over the columns, but it is too slow. My code is as follows:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.col

def std(sparkSession: SparkSession, feature: String, df: DataFrame): DataFrame = {
  // Drop both real nulls and the literal string "null".
  val df1 = df.select(feature).filter(r => !r.isNullAt(0) && r.get(0).toString != "null")

  // Each action below triggers a separate full pass over the data.
  val values = df1.rdd.map(_.get(0).toString.toDouble)
  val max = values.max()
  val min = values.min()
  val sum = values.sum()
  val count = df1.count().toDouble
  val mean = sum / count
  val median = df1.stat.approxQuantile(feature, Array(0.5), 0.001)(0)

  // Population variance: Var(X) = E[X^2] - (E[X])^2, computed with yet another pass.
  val variance = df1.withColumn("power", col(feature) * col(feature))
    .select("power").rdd.map(_.get(0).toString.toDouble).sum() / count - mean * mean
  val std = math.sqrt(variance)

  import sparkSession.implicits._
  Seq(
    (feature, "count", count.toString),
    (feature, "median", median.toString),
    (feature, "max", max.toString),
    (feature, "min", min.toString),
    (feature, "mean", mean.toString),
    (feature, "std", std.toString)
  ).toDF("feature", "type", "value")
}
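For reference, a sketch (my assumption, not the question's code) of how the same six statistics for one column could be gathered in a single aggregation job instead of several RDD passes. It assumes Spark 2.1+ so the approx_percentile SQL function is available; casting to double turns the literal string "null" into a real null, which the aggregate functions already skip, so no explicit filter is needed:

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._

// Hypothetical single-pass variant: one agg(...) call computes all six stats.
def stdOnePass(feature: String, df: DataFrame): DataFrame = {
  val c = col(feature).cast("double")
  val row: Row = df.agg(
    count(c).alias("count"),  // count() skips nulls, i.e. the non-null count
    expr(s"approx_percentile(cast(`$feature` as double), 0.5)").alias("median"),
    max(c).alias("max"),
    min(c).alias("min"),
    avg(c).alias("mean"),
    stddev_pop(c).alias("std")  // population std, matching the question's formula
  ).head()

  import df.sparkSession.implicits._
  // Reshape the single wide row into the (feature, type, value) layout above.
  Seq("count", "median", "max", "min", "mean", "std").zipWithIndex
    .map { case (stat, i) => (feature, stat, s"${row.get(i)}") }
    .toDF("feature", "type", "value")
}

All columns could be batched into one agg(...) call the same way, which would turn the whole loop into a single Spark job.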

// Run std() for every column and union the per-column results.
val stdDF = stdArr
  .map(feature => std(sparkSession, feature, userFeatureDF))
  .reduce(_ union _)

stdArr is the array of column names of the DataFrame userFeatureDF.

How can I compute the final stdDF in parallel?
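A sketch of one possible approach, assuming Scala 2.11/2.12 where parallel collections are built in: submit the per-column jobs concurrently from the driver so the Spark scheduler can run them at the same time. std, stdArr, sparkSession, and userFeatureDF are the question's own names:

// Launch the per-column std() jobs concurrently with a parallel collection;
// each map call still blocks on its own Spark job, but the jobs overlap.
val stdDF = stdArr.par
  .map(feature => std(sparkSession, feature, userFeatureDF))
  .reduce(_ union _)

Driver-side concurrency only helps when the cluster has spare capacity; batching all columns into a single agg(...) job, as sketched after the std function above, usually removes more of the overhead.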

0 Answers:

No answers yet.