Sample input:
Item_Id Item_Name Buyer's_Id Buyers_Name
0001 Keyboard 10000 ABC
0002 Monitor 10010 XYZ
0001 Keyboard 10005 DXC
Sample intermediate output:
0001,Keyboard,{"Buyer's_Id":"10000","Buyers_Name":"ABC"}
0002,Monitor,{"Buyer's_Id":"10010","Buyers_Name":"XYZ"}
0001,Keyboard,{"Buyer's_Id":"10005","Buyers_Name":"DXC"}
Final output:
0001,Keyboard,[{"Buyer's_Id":"10000","Buyers_Name":"ABC"},{"Buyer's_Id":"10005","Buyers_Name":"DXC"}]
0002,Monitor,[{"Buyer's_Id":"10010","Buyers_Name":"XYZ"}]
Answer 0 (score: 1)
What you want to achieve can be done with either map, which processes each row, or mapPartitions, which processes each partition.
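For reference, the input_df used in the transcript below can be built in spark-shell roughly like this (a minimal sketch, not part of the original answer; the column names come from the question, and Item_Id is stored as an integer, which is why it shows as 1 rather than 0001):

// Minimal sketch of the sample data as a DataFrame.
// spark.implicits._ is already in scope in spark-shell.
import spark.implicits._

val input_df = Seq(
  (1, "Keyboard", 10000, "ABC"),
  (2, "Monitor",  10010, "XYZ"),
  (1, "Keyboard", 10005, "DXC")
).toDF("Item_Id", "Item_Name", "Buyer's_Id", "Buyers_Name")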
scala> input_df.show
+-------+---------+----------+-----------+
|Item_Id|Item_Name|Buyer's_Id|Buyers_Name|
+-------+---------+----------+-----------+
|      1| Keyboard|     10000|        ABC|
|      2|  Monitor|     10010|        XYZ|
|      1| Keyboard|     10005|        DXC|
+-------+---------+----------+-----------+
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.collect_set
Since the intermediate DataFrame has a different schema, we first need to define that new schema:
scala> val schema = StructType(Seq(
| StructField("item_number", IntegerType),
| StructField("item_name", StringType),
| StructField("json_string", StringType)
| ))
scala> val encoder = RowEncoder(schema)
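As a side note, if you would rather not build a RowEncoder by hand, a case class works as well, since Spark derives encoders for product types automatically (a sketch under that assumption, kept close to the map code below):

// Hypothetical alternative: a case class instead of schema + RowEncoder.
// Relies on spark.implicits._, which is already imported in spark-shell.
case class Intermediate(item_number: Int, item_name: String, json_string: String)

val intermediate_ds = input_df.map { row =>
  val byer_id  = row.getAs[Int]("Buyer's_Id")
  val byer_nme = row.getAs[String]("Buyers_Name")
  Intermediate(row.getAs[Int]("Item_Id"), row.getAs[String]("Item_Name"),
    s"""{"Buyer's_id" : $byer_id,"Buyers_Name" : $byer_nme}""")
}  // no explicit encoder argument needed here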
scala> val intermediate_df = input_df.map{row =>
| val itm_nbr = row.getAs[Integer]("Item_Id")
| val itm_nme = row.getAs[String]("Item_Name")
| val byer_id = row.getAs[Integer]("Buyer's_Id")
| val byer_nme = row.getAs[String]("Buyers_Name")
| val req_string = s"""{"Buyer's_id" : $byer_id,"Buyers_Name" : $byer_nme}"""
| Row(itm_nbr,itm_nme,req_string)
| }(encoder)
intermediate_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [item_number: int, item_name: string ... 1 more field]
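For completeness, the mapPartitions route mentioned at the top looks nearly identical; the function just receives an iterator of rows per partition instead of a single row (a sketch reusing the same schema and encoder, not part of the original answer):

// Hypothetical mapPartitions equivalent of the map above.
val intermediate_df_mp = input_df.mapPartitions { rows =>
  rows.map { row =>
    val byer_id  = row.getAs[Integer]("Buyer's_Id")
    val byer_nme = row.getAs[String]("Buyers_Name")
    Row(row.getAs[Integer]("Item_Id"), row.getAs[String]("Item_Name"),
      s"""{"Buyer's_id" : $byer_id,"Buyers_Name" : $byer_nme}""")
  }
}(encoder)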
scala> intermediate_df.show(false)
+-----------+---------+------------------------------------------+
|item_number|item_name|json_string                               |
+-----------+---------+------------------------------------------+
|1          |Keyboard |{"Buyer's_id" : 10000,"Buyers_Name" : ABC}|
|2          |Monitor  |{"Buyer's_id" : 10010,"Buyers_Name" : XYZ}|
|1          |Keyboard |{"Buyer's_id" : 10005,"Buyers_Name" : DXC}|
+-----------+---------+------------------------------------------+
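Note that the json_string built above is not strictly valid JSON, because the buyer names are unquoted and the key casing ("Buyer's_id") differs from the question's "Buyer's_Id". If the output has to look exactly like the question's, one option is to let Spark serialize a struct instead of hand-building the string (a sketch assuming the same input_df, not part of the original answer):

// Hedged alternative: to_json produces properly quoted JSON from a struct.
import org.apache.spark.sql.functions.{to_json, struct, col}

val intermediate_json_df = input_df.select(
  col("Item_Id").as("item_number"),
  col("Item_Name").as("item_name"),
  to_json(struct(
    col("Buyer's_Id").cast("string").as("Buyer's_Id"),   // cast so the id is quoted in the JSON
    col("Buyers_Name")
  )).as("json_string")
)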
scala> val result_df = intermediate_df.groupBy('item_number,'item_name).agg(collect_set('json_string).as("json_list")).orderBy('item_number)
result_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [item_number: int, item_name: string ... 1 more field]
scala> result_df.show(false)
+-----------+---------+----------------------------------------------------------------------------------------+
|item_number|item_name|json_list                                                                               |
+-----------+---------+----------------------------------------------------------------------------------------+
|1          |Keyboard |[{"Buyer's_id" : 10000,"Buyers_Name" : ABC}, {"Buyer's_id" : 10005,"Buyers_Name" : DXC}]|
|2          |Monitor  |[{"Buyer's_id" : 10010,"Buyers_Name" : XYZ}]                                            |
+-----------+---------+----------------------------------------------------------------------------------------+
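If the very last step has to look like the question's final output (plain text lines such as 0001,Keyboard,[...]), one possible sketch is to flatten the collected array and format the id; the zero-padding to four digits and the output path are assumptions here, not part of the original answer:

// Hypothetical final formatting step.
import org.apache.spark.sql.functions.{concat, concat_ws, format_string, lit}

val final_lines = result_df.select(
  concat_ws(",",
    format_string("%04d", 'item_number),                     // 1 -> "0001"
    'item_name,
    concat(lit("["), concat_ws(",", 'json_list), lit("]"))   // join the collected strings
  ).as("value")
)

final_lines.write.text("output_path")   // "output_path" is a placeholder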
Hope this helps!
Answer 1 (score: 0)
// Build a per-row map of the buyer fields, then collect the maps per item.
// `dF` is the input DataFrame from the question (input_df above).
import org.apache.spark.sql.functions.{map, lit, collect_set}
import spark.implicits._   // for the $"..." column syntax (already in scope in spark-shell)

val grouped = dF.select(
    $"Item_Id",
    $"Item_Name",
    map(
      lit("Buyer's_Id"), $"Buyer's_Id",
      lit("Buyers_Name"), $"Buyers_Name"
    ).as("newCol")
  )
  .groupBy("Item_Id", "Item_Name")
  .agg(collect_set($"newCol").as("mapCol"))
  .orderBy("Item_Id")
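One caveat, not from the original answer: mapCol here ends up as an array of maps, not JSON text. On Spark 2.4 or later, to_json can render it as a JSON array string close to the question's final output; a sketch, assuming the expression above was assigned to grouped as shown:

// Hedged follow-up: convert the collected array of maps into a JSON array string.
import org.apache.spark.sql.functions.to_json

val withJson = grouped.withColumn("mapCol", to_json($"mapCol"))
withJson.show(false)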