I am new to Spark and Scala and am trying to solve the following problem, but cannot. Please help me with this; any help is appreciated.
The requirement is to sum the values column-wise. The following code produces the output shown below:
val first = vlist.map(_.select("value"))
first.map(_.show())
Output:
first: Array[org.apache.spark.sql.DataFrame] =
Array([value: array<double>], [value: array<double>])
+--------------------+
| value|
+--------------------+
|[-0.047363, 0.187...|
|[0.043701, -0.114...|
|[-0.006439, 0.031...|
|[0.168945, 0.0639...|
|[0.049805, 0.0664...|
|[-0.054932, -0.11...|
|[0.094727, -0.118...|
|[0.136719, 0.1484...|
|[-0.12793, 0.2812...|
|[-0.071289, -0.07...|
|[0.115234, -0.012...|
|[0.253906, 0.0385...|
|[-0.062988, 0.031...|
|[0.110352, 0.2480...|
|[0.042725, 0.2949...|
|[-0.074219, 0.112...|
|[0.072754, -0.092...|
|[-0.063965, 0.058...|
|[0.083496, -0.007...|
|[0.043945, 0.1767...|
+--------------------+
only showing top 20 rows
+--------------------+
| value|
+--------------------+
|[0.045654, -0.145...|
|[0.053467, 0.0120...|
|[0.033203, -0.089...|
|[-0.08252, 0.0224...|
|[0.182617, -0.044...|
|[0.136719, 0.1484...|
|[0.112793, -0.130...|
|[0.096191, -0.028...|
|[-0.007141, 0.004...|
|[0.115234, -0.012...|
|[0.130859, 0.0084...|
|[-0.020874, 0.021...|
|[-0.267578, 0.084...|
|[-0.015015, 0.193...|
|[0.036865, 0.0201...|
|[0.205078, 0.0042...|
|[-0.013733, -0.07...|
|[0.175781, 0.2128...|
|[-0.061279, -0.06...|
|[0.058838, 0.3574...|
+--------------------+
The next step should be the sum of all values in each column, so ideally I should end up with just one row. I tried the following code:
first.toList.transpose.map(_.sum)
Output:
<console>:183: error: No implicit view available from
org.apache.spark.sql.DataFrame => scala.collection.GenTraversableOnce[B].
first.toList.transpose.map(_.sum)
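(For reference: transpose fails here because first is an Array[org.apache.spark.sql.DataFrame], and transpose needs a collection of collections, which a DataFrame is not. A local workaround sketch, under the assumption that each DataFrame's rows fit in driver memory, would be to collect each DataFrame first:)
// sketch: collect to the driver, then transpose and sum locally
val localSums: Array[Seq[Double]] = first.map { df =>
  df.collect()                  // bring all rows to the driver
    .map(_.getSeq[Double](0))   // extract the array<double> column
    .toSeq
    .transpose                  // flip rows and columns
    .map(_.sum)                 // sum each column
}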
Additionally, I tried splitting the values into separate columns (4 columns, for testing purposes only) and applying the agg function, as shown below:
var table = first
for (i <- 0 to 3) {
table = table.map(_.withColumn("vec_" + i, $"value"(i)))
}
var inter = table.map(_.drop("value"))
inter.map(_.show())
var exprs = inter.map(_.columns.map(_ -> "sum").toMap)
inter.agg(exprs)
Output:
table: Array[org.apache.spark.sql.DataFrame] =
Array([value: array<double>], [value: array<double>])
inter: Array[org.apache.spark.sql.DataFrame] =
Array([vec_0: double,
vec_1: double ... 2 more fields],
[vec_0: double,
vec_1: double ... 2 more fields])
+---------+---------+---------+---------+
| vec_0| vec_1| vec_2| vec_3|
+---------+---------+---------+---------+
|-0.047363| 0.1875| 0.002258| 0.173828|
| 0.043701|-0.114258| 0.067383|-0.060547|
|-0.006439| 0.031982| 0.012878| 0.020264|
| 0.168945| 0.063965|-0.084473| 0.173828|
| 0.049805| 0.066406| 0.03833| 0.02356|
|-0.054932|-0.117188| 0.027832| 0.074707|
| 0.094727|-0.118652| 0.118164| 0.253906|
| 0.136719| 0.148438| 0.114746| 0.069824|
| -0.12793| 0.28125| 0.01532|-0.046631|
|-0.071289| -0.07373| 0.199219|-0.069824|
| 0.115234|-0.012512|-0.022949| 0.194336|
| 0.253906| 0.038574|-0.030396| 0.248047|
|-0.062988| 0.031494|-0.302734| 0.030396|
| 0.110352| 0.248047| -0.00769|-0.031494|
| 0.042725| 0.294922| 0.019653| 0.030884|
|-0.074219| 0.112793| 0.094727| 0.071777|
| 0.072754|-0.092773|-0.174805|-0.022583|
|-0.063965| 0.058838| 0.086914| 0.320312|
| 0.083496|-0.007294|-0.026489| -0.05957|
| 0.043945| 0.176758| 0.094727|-0.083496|
+---------+---------+---------+---------+
only showing top 20 rows
+---------+---------+---------+---------+
| vec_0| vec_1| vec_2| vec_3|
+---------+---------+---------+---------+
| 0.045654|-0.145508| 0.15625| 0.166016|
| 0.053467| 0.012024| -0.0065| 0.008545|
| 0.033203|-0.089844|-0.294922| 0.115234|
| -0.08252| 0.022461|-0.149414| 0.099121|
| 0.182617|-0.044922| 0.138672| 0.011658|
| 0.136719| 0.148438| 0.114746| 0.069824|
| 0.112793|-0.130859| 0.066895| 0.138672|
| 0.096191|-0.028687|-0.108398| 0.145508|
|-0.007141| 0.004486| 0.02063| 0.010803|
| 0.115234|-0.012512|-0.022949| 0.194336|
| 0.130859| 0.008423| 0.033447|-0.058838|
|-0.020874| 0.021851|-0.083496|-0.072266|
|-0.267578| 0.084961| 0.109863| 0.086914|
|-0.015015| 0.193359| 0.014832| 0.07373|
| 0.036865| 0.020142| 0.22168| 0.155273|
| 0.205078| 0.004211| 0.084473| 0.091309|
|-0.013733|-0.074219| 0.017334|-0.016968|
| 0.175781| 0.212891|-0.071289| 0.084961|
|-0.061279|-0.068359| 0.120117| 0.191406|
| 0.058838| 0.357422| 0.128906|-0.162109|
+---------+---------+---------+---------+
only showing top 20 rows
res4164: Array[Unit] = Array((), ())
exprs: Array[scala.collection.immutable.Map[String,String]] = Array(Map(vec_0 -> sum, vec_1 -> sum, vec_2 -> sum, vec_3 -> sum), Map(vec_0 -> sum, vec_1 -> sum, vec_2 -> sum, vec_3 -> sum))
<console>:189: error: value agg is not a member of Array[org.apache.spark.sql.DataFrame]
inter.agg(exprs)
^
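(For reference: the agg error occurs because agg is a method on a single DataFrame, not on Array[org.apache.spark.sql.DataFrame]. A sketch of what the call would need to look like, pairing each DataFrame with its own expression map:)
// agg must be invoked per DataFrame; zip each one with its expression map
val summed = inter.zip(exprs).map { case (df, e) => df.agg(e) }
summed.map(_.show())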
Please help me. I believe there should be a simple way to do this. Thanks in advance.
Adding sample input and output.
Sample input:
first: Array[org.apache.spark.sql.DataFrame] =
Array([value: array<double>], [value: array<double>])
value
1,2,3,4,5,6,7,8
1,2,3,4,5,6,7,8
value
1,2,3,4,5,6,7,8
1,2,3,4,5,6,7,8
Sample output:
first: Array[org.apache.spark.sql.DataFrame] =
Array([value: array<double>], [value: array<double>])
value
2,4,6,8,10,12,14,16
value
2,4,6,8,10,12,14,16
Answer 0 (score: 0)
You can try the aggregation methods; there is a named function 'sum' that aggregates column-wise:
df.agg(sum("col1"), sum("col2"), ...)
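A sketch of applying this to the vec_ columns built in the question (the inter array and its column names are assumptions carried over from the question's code):
import org.apache.spark.sql.functions.{col, sum}

// build one sum(...) expression per column, then aggregate each DataFrame
val totals = inter.map { df =>
  val sumCols = df.columns.map(c => sum(col(c)))
  df.agg(sumCols.head, sumCols.tail: _*)
}
totals.map(_.show())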
Hope this helps.
Answer 1 (score: 0)
The following code works. Thanks to the people who spent time on this; I appreciate it.
val first = vlist.map(_.select("value"))
first.map(_.show())
var table = first
for (i <- 0 to 3) {
table = table.map(_.withColumn("vec_" + i, $"value"(i)))
}
var inter = table.map(_.drop("value"))
inter.map(_.show())
//var exprs = inter.map(_.columns.map(_ -> "sum").toMap)
//inter.agg(exprs)
val tab = inter.map(_.groupBy().sum())  // groupBy() with no columns forms one global group, so sum() totals every numeric column
tab.map(_.show())
first: Array[org.apache.spark.sql.DataFrame] = Array([value: array<double>], [value: array<double>])
table: Array[org.apache.spark.sql.DataFrame] = Array([value: array<double>], [value: array<double>])
inter: Array[org.apache.spark.sql.DataFrame] = Array([vec_0: double, vec_1: double ... 2 more fields], [vec_0: double, vec_1: double ... 2 more fields])
tab: Array[org.apache.spark.sql.DataFrame] = Array([sum(vec_0): double, sum(vec_1): double ... 2 more fields], [sum(vec_0): double, sum(vec_1): double ... 2 more fields])
+------------------+------------------+------------------+------------------+
| sum(vec_0)| sum(vec_1)| sum(vec_2)| sum(vec_3)|
+------------------+------------------+------------------+------------------+
|2.5046410000000003|2.1487149999999997|1.0884870000000002|3.5877090000000003|
+------------------+------------------+------------------+------------------+
+------------------+------------------+----------+------------------+
| sum(vec_0)| sum(vec_1)|sum(vec_2)| sum(vec_3)|
+------------------+------------------+----------+------------------+
|0.9558040000000001|0.9843780000000002| 0.545025|0.9979860000000002|
+------------------+------------------+----------+------------------+
res325: Array[Unit] = Array((), ())
Answer 2 (score: 0)
Great if you have already solved the problem. After converting the "value" column into separate columns of the DataFrame as described above, do the following:
val finalDf = df.groupBy().sum()
finalDf is a DataFrame containing the column-wise sums.
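A minimal end-to-end sketch of that pipeline (assuming, as in the question's test setup, a DataFrame df whose "value" column is an array<double> with four elements):
import org.apache.spark.sql.functions.col

// widen the array column into vec_0 .. vec_3, then sum column-wise
val widened = (0 until 4).foldLeft(df.select("value")) { (acc, i) =>
  acc.withColumn(s"vec_$i", col("value")(i))
}.drop("value")

val finalDf = widened.groupBy().sum()
finalDf.show()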
Answer 3 (score: 0)
If I understand the question correctly, you can use posexplode to explode the array together with its index. You can then group by the index and sum.
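A sketch of that approach (assuming a DataFrame df with an array<double> column named "value", as in the question):
import org.apache.spark.sql.functions.{col, posexplode, sum}

// posexplode emits one row per array element, as columns "pos" and "col"
val perIndexSums = df
  .select(posexplode(col("value")))
  .groupBy("pos")                  // group element-wise by array index
  .agg(sum("col").as("sum"))
  .orderBy("pos")
perIndexSums.show()
If a single array is needed again, the per-index sums can be collected back into one row (for example with collect_list), though element order should be verified after that step.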