答案 0（得分：1）
尝试使用 groupBy 与 count。
Example:
# Sample input DataFrame: an ID column paired with a timestamp string.
# Note the duplicate row (ID=1 at 07-24-2019,19:47:36) — the aggregation
# below is meant to surface that duplicate via its count of 2.
df.show()
#+---+-------------------+
#| ID| TIME|
#+---+-------------------+
#| 1|07-24-2019,19:47:36|
#| 2|07-24-2019,20:43:39|
#| 1|07-24-2019,20:47:36|
#| 1|07-24-2019,19:47:36|
#+---+-------------------+
# Import only the names we use: the wildcard `from pyspark.sql.functions
# import *` silently shadows Python builtins (sum, max, min, abs, ...)
# and obscures where `count`/`col` come from.
from pyspark.sql.functions import col, count

# Count occurrences of each (ID, TIME) pair: group on both columns,
# count the rows per group, and order the output for readability.
df.groupBy("ID", "TIME").\
    agg(count(col("ID")).alias("count")).\
    orderBy("ID", "TIME").\
    show()

# Equivalent variant: counting TIME instead of ID yields the same result,
# since both are grouping columns and therefore non-null within a group.
df.groupBy("ID", "TIME").\
    agg(count(col("TIME")).alias("count")).\
    orderBy("ID", "TIME").\
    show()
#+---+-------------------+-----+
#| ID| TIME|count|
#+---+-------------------+-----+
#| 1|07-24-2019,19:47:36| 2|
#| 1|07-24-2019,20:47:36| 1|
#| 2|07-24-2019,20:43:39| 1|
#+---+-------------------+-----+