I am using this formula, taken from another SO post, to calculate the median of columns in pyspark:
columns = ['id', 'dogs', 'cats']
vals = [(1, 2, 0),(2, 0, 1)]
df = sqlContext.createDataFrame(vals, columns)
df.approxQuantile(list(c for c in df.columns), [0.5], 0)
The formula works when the number of rows in df is odd, but when the row count is even it does not average the two central elements (after sorting, of course); it just takes the first one.
For example, the above code returns:
[[1.0], [0.0], [0.0]]
It should be:
[[1.5], [1.0], [0.5]]
How can I compute the median correctly in pyspark?
Answer 0 (score: 0)
Here is one hacky way of getting the median:
import numpy as np
import pandas as pd

np.random.seed(2019)

def get_median(df, col):
    # if the number of rows is even, fetch the two "middle" values and average them
    num_rows = df.count()
    if num_rows % 2 == 0:
        mid_pt = num_rows / 2
        q1 = df.approxQuantile(col, [0.5], 0)
        q2 = df.approxQuantile(col, [(mid_pt + 1) * 1. / num_rows], 0)
        return np.mean(q1 + q2).tolist()
    return df.approxQuantile(col, [0.5], 0)

# now test it out
df = pd.DataFrame(data=np.random.rand(20, 2), columns=['dogs', 'cats'])
df['id'] = list(range(20))
df = sqlContext.createDataFrame(df)

df.approxQuantile("cats", [0.5], 0)
# [0.5783015865898744]
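To exercise get_median itself (this call is my addition; it assumes the same sqlContext session and the 20-row df built above):

# 20 rows is even, so the two middle quantiles are averaged
get_median(df, "cats")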
Answer 1 (score: 0)
I compute the exact median by group, without using numpy. You can easily adapt the method by removing the window part.
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.window import Window
columns = ['group1_col', 'group2_col', 'value_col']
vals = [['a', 'aa', 1],
['a', 'aa', 2],
['b', 'aa', 2],
['b', 'aa', 0],
['c', 'aa', 0],
['c', 'bb', 1],
['d', 'bb', 10],
['d', 'bb', 20],
['d', 'bb', 30],
]
df = spark.createDataFrame(vals, columns)
def compute_median(self, col, median_name, by_col=None):
    """ Method to be added to the native Spark DataFrame class """
    df_without_null = self.filter(F.col(col).isNotNull())

    # one window per group (or a single global window) to count rows,
    # plus an ordered version of it to rank each value
    window_spec = Window.partitionBy()
    if by_col is not None:
        window_spec = Window.partitionBy(by_col)
    window_spec_order = window_spec.orderBy(col)

    df = (df_without_null
          .withColumn('row_number', F.row_number().over(window_spec_order))
          .withColumn('total_rows', F.count(F.lit(1)).over(window_spec)))

    # keep the middle row, plus the second middle row when the count is even
    row_to_keep = (
        df
        .filter((F.col('row_number') == F.ceil(F.col('total_rows') / 2))
                | ((F.col('total_rows') % 2 == 0)
                   & (F.col('row_number') == F.floor(F.col('total_rows') / 2) + 1))))

    if by_col is None:
        return row_to_keep.select(F.mean(F.col(col)).alias(median_name))
    return row_to_keep.groupBy(by_col).agg(F.mean(F.col(col)).alias(median_name))
# Add method to DataFrame class
DataFrame.compute_median = compute_median
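# Monkey-patching the class makes compute_median available as a method on
# every DataFrame instance, so `self` receives the DataFrame it is called on
# (this is why the DataFrame import above is needed).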
# med = df.compute_median("value_col", "global_median")
# med.collect()
# global_median
# 2.0
# med_group1 = df.compute_median("value_col", "median", 'group1_col')
# med_group1.collect()
# group1_col | median
# a | 1.5
# b | 1.0
# c | 0.5
# d | 20.0
# med_group2 = df.compute_median("value_col", "median", 'group2_col')
# med_group2.collect()
# group2_col | median
# aa | 1.0
# bb | 15.0
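As a quick sanity check (my addition, not part of the original answer), the global median above can be reproduced by collecting the column and using Python's statistics module:

import statistics
values = [row['value_col'] for row in df.select('value_col').collect()]
statistics.median(values)
# 2.0, matching global_median above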
You can check that my filter condition is equivalent to this one (longer, but perhaps easier to understand):
res = (df_rank_and_number_or_row
       .filter(
           ((F.col('total_rows') % 2 == 0)
            & ((F.col('row_number') == F.ceil(F.col('total_rows') / 2))
               | (F.col('row_number') == F.floor(F.col('total_rows') / 2) + 1)))
           | ((F.col('total_rows') % 2 != 0)
              & (F.col('row_number') == F.ceil(F.col('total_rows') / 2)))))
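A small brute-force check (my addition) confirms that the two predicates keep exactly the same row numbers for every row count:

import math

# compare the short filter used in compute_median with the longer one above,
# for every row count n up to 100 and every rank r within it
for n in range(1, 101):
    for r in range(1, n + 1):
        cond_short = (r == math.ceil(n / 2)) or (n % 2 == 0 and r == math.floor(n / 2) + 1)
        cond_long = ((n % 2 == 0 and (r == math.ceil(n / 2) or r == math.floor(n / 2) + 1))
                     or (n % 2 != 0 and r == math.ceil(n / 2)))
        assert cond_short == cond_long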