# Build the sample DataFrame: one row per (date range, percent, device),
# then name the three columns via toDF().
# NOTE(review): the second row's percent is the Python int 100 while the other
# rows use float literals; a FloatType "percent" column presumably needs 100.0
# here -- confirm against the Spark version in use.
df = spark.createDataFrame([
("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 80.65,"abc"),
("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 100,"abc"),
("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 65,"def"),
("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 78.02,"def")
]).toDF("date", "percent","device")
我需要使用avg来应用groupby:
schema = StructType([
StructField('date', StringType(), True),
StructField('percent', FloatType(), True),
StructField('device', StringType(), True)
])
# Average "percent" per device, rounded to 2 decimals.
# NOTE(review): no import is visible for `round` / `mean`; they are presumably
# meant to be the pyspark.sql.functions versions. If `round` is still the
# Python 2 builtin, it raises "TypeError: a float is required" when given a
# non-float argument -- confirm which `round` is in scope.
# NOTE(review): alias("y") is applied to mean(...) inside round(...), so the
# final rounded column is not the one being named "y".
# NOTE(review): `dtaDF` is not defined in the visible snippet (only `df` is);
# presumably it is the DataFrame built with `schema` -- confirm.
dtaDF.groupBy("device").agg(round(mean("percent").alias("y"),2))
我面临以下异常
TypeError: a float is required
答案 0 :(得分:0)
>>> # Every percent value is written as a float literal here (100.00, 65.00).
>>> df = sqlContext.createDataFrame([
... ("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 80.65,"abc"),
... ("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 100.00,"abc"),
... ("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 65.00,"def"),
... ("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 78.02,"def")
... ]).toDF("date", "percent","device")
>>> # NOTE(review): schema is built but never passed to createDataFrame in this session.
>>> schema = StructType([
... StructField('date', StringType(), True),
... StructField('percent', FloatType(), True),
... StructField('device', StringType(), True)
... ])
>>> # NOTE(review): alias("y") sits inside round(...), so the displayed column is not named "y".
>>> df.groupBy("device").agg(round(mean("percent").alias("y"),2)).show()
+------+--------------------------------------------------------------+
|device|round((avg(percent),mode=Complete,isDistinct=false) AS y#16,2)|
+------+--------------------------------------------------------------+
| def| 71.51|
| abc| 90.33|
+------+--------------------------------------------------------------+
答案 1 :(得分:0)
如果要使用UDF:
@pandas_udf("float", PandasUDFType.GROUPED_AGG)
def mean_udf(v):
    """Return the mean of ``v`` rounded to two decimal places.

    Declared through the decorator as a ``GROUPED_AGG`` pandas UDF whose
    result type is ``"float"``.

    Args:
        v: The values to aggregate. Must provide a ``mean()`` method
            (presumably a pandas Series -- confirm against the Spark
            version in use).

    Returns:
        ``round(v.mean(), 2)``.
    """
    return round(v.mean(), 2)
# Register the UDF under the name "mean_udf" so the dict-style agg() call
# below can refer to it by that string.
spark.udf.register("mean_udf", mean_udf)

# Explicit schema: "percent" is declared as a FloatType column.
readings_schema = StructType([
    StructField('date', StringType(), True),
    StructField('percent', FloatType(), True),
    StructField('device', StringType(), True),
])

# Sample rows: (date range, percent, device).
readings = [
    ("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 80.65, "abc"),
    ("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 100.00, "abc"),
    ("2017-Dec-08 00:00 - 2017-Dec-09 00:00", 65.00, "def"),
    ("2017-Dec-09 00:00 - 2017-Dec-10 00:00", 78.02, "def"),
]

dfStackOverflow = spark.createDataFrame(readings, schema=readings_schema)

# Aggregate "percent" per device with the registered UDF and print the result.
dfStackOverflow.groupBy("device").agg({"percent": "mean_udf"}).show()
+------+-----------------+
|device|mean_udf(percent)|
+------+-----------------+
| abc| 90.32|
| def| 71.51|
+------+-----------------+