import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val someData = Seq(
  Row("test.zip", "abc1", "A"),
  Row("test.zip", "abc1", "B"),
  Row("test.zip", "abc3", "C")
)

val someSchema = List(
  StructField("file_name", StringType, true),
  StructField("id", StringType, true),
  StructField("chart_char", StringType, true)
)

val someDF = spark.createDataFrame(
  spark.sparkContext.parallelize(someData),
  StructType(someSchema)
)
I need the following result, i.e. the count of distinct ids per file_name.
Expected result:
file_name  count
test.zip   2
Answer 0 (score: 0)
import spark.implicits._

val df = Seq(
  ("test.zip", "abc1", "A"),
  ("test.zip", "abc1", "B"),
  ("test.zip", "abc3", "C"),
  ("test2.zip", "abc1", "C")
).toDF("file_name", "id", "chart_char")
df.show()
//+---------+----+----------+
//|file_name| id|chart_char|
//+---------+----+----------+
//| test.zip|abc1| A|
//| test.zip|abc1| B|
//| test.zip|abc3| C|
//|test2.zip|abc1| C|
//+---------+----+----------+
df.select("file_name", "id")
.distinct()
.groupBy("file_name")
.count
.show
//+---------+-----+
//|file_name|count|
//+---------+-----+
//| test.zip| 2|
//|test2.zip| 1|
//+---------+-----+
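Equivalently, the distinct count can be done in a single aggregation with countDistinct from org.apache.spark.sql.functions. A minimal sketch, assuming the same someDF built from the question's Row-based schema:

import org.apache.spark.sql.functions.countDistinct

// count distinct ids per file_name in one aggregation step
someDF
  .groupBy("file_name")
  .agg(countDistinct("id").as("count"))
  .show()
// for the question's sample data this yields file_name=test.zip, count=2

Both versions produce the same result; the countDistinct form avoids the intermediate select/distinct and expresses the intent directly in the aggregation.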