我想对数值进行分桶(bucket),并基于这些分桶在汇总数据上生成摘要报告。目前我使用 apply 来实现,但是 apply 在大型数据集上可能会非常慢。create_lt_ten_bucket 中使用的那种语法是否有通用形式?我猜这更多是 NumPy 方面的问题,而我对 NumPy 不太熟悉。
def create_buckets(df_internal, comparison_operator, column_to_bucket, min_value, max_value, ranges_pivots):
    """Add a ``<column_to_bucket>_BUCKETED`` label column to ``df_internal``.

    Consecutive bounds ``[min_value, *ranges_pivots, max_value]`` define the
    ranges. Each value is labelled with the first range ``(low, high)`` for
    which ``comparison_operator(low, value)`` and
    ``comparison_operator(value, high)`` both hold.

    Labels have the form ``"<index>|<low>_to_<high>"``, where ``<index>`` is
    the range position, zero-padded to the number of characters of the
    largest bound. Values matching no range are labelled ``"OUTOFBAND"``.

    Args:
        df_internal: DataFrame to modify in place.
        comparison_operator: Two-argument callable returning a truthy value
            when its arguments are in the desired order (e.g. ``operator.le``).
        column_to_bucket: Name of the column whose values are bucketed.
        min_value: Lower bound of the first range.
        max_value: Upper bound of the last range.
        ranges_pivots: Sequence of interior boundaries, in ascending order.

    Returns:
        None. The new column is written onto ``df_internal``.
    """
    # Accept any sequence (list, tuple, ...) of pivots, not only lists.
    pivots = list(ranges_pivots)
    low = [min_value] + pivots
    high = pivots + [max_value]
    max_str_len = len(str(max(high + low)))

    # Build every label once up front instead of re-formatting it per row.
    labelled_ranges = [
        (lower, upper, "{}|{}_to_{}".format(str(count).zfill(max_str_len), lower, upper))
        for count, (lower, upper) in enumerate(zip(low, high))
    ]

    def get_value(value):
        # First matching range wins, so a value sitting on a shared boundary
        # is assigned to the earlier range.
        for lower, upper, label in labelled_ranges:
            if comparison_operator(lower, value) and comparison_operator(value, upper):
                return label
        return "OUTOFBAND"

    # Only one column is inspected, so map over that column rather than
    # materialising every row with DataFrame.apply(axis=1).
    df_internal["{}_BUCKETED".format(column_to_bucket)] = df_internal[column_to_bucket].map(get_value)
def create_lt_ten_bucket(df_internal, column_to_bucket):
    """Add a boolean ``<column_to_bucket>_is_lt_ten`` column to ``df_internal``.

    The new column is True where the value in ``column_to_bucket`` is
    strictly less than 10. The comparison runs on the whole column at once.

    Args:
        df_internal: DataFrame to modify in place.
        column_to_bucket: Name of the column to compare against 10.

    Returns:
        None. The new column is written onto ``df_internal``.
    """
    df_internal["{}_is_lt_ten".format(column_to_bucket)] = df_internal[column_to_bucket] < 10
# Demo: bucket a small sample column using inclusive (<=) comparisons on both
# ends of each range, with bounds 0 .. 999.
dftest = pd.DataFrame([1,2,3,4,5, 44, 250, 22], columns=["value_alpha"])
create_buckets(dftest, lambda v1,v2: v1 <= v2, "value_alpha", 0, 999, [1, 2, 5, 10, 25, 50, 100, 200])
# `display` is presumably the IPython/Jupyter notebook helper - it is not defined in this snippet.
display(dftest)
# Add the boolean "value < 10" flag column alongside the bucket label.
create_lt_ten_bucket(dftest, "value_alpha")
display(dftest)
# Sum every column per bucket label and order the result by that label.
dftest.groupby('value_alpha_BUCKETED').sum().sort_values('value_alpha_BUCKETED')
OUTPUT
value_alpha value_alpha_BUCKETED
0 1 000|0_to_1
1 2 001|1_to_2
2 3 002|2_to_5
3 4 002|2_to_5
4 5 002|2_to_5
5 44 005|25_to_50
6 250 008|200_to_999
7 22 004|10_to_25
# Same demo as before, but with a much larger upper bound.
dftest = pd.DataFrame([1,2,3,4,5, 44, 250, 22], columns=["value_alpha"])
# NOTE(review): create_buckets pads the bucket index to len(str(largest bound)),
# so max_value=999999999 would yield 9-digit prefixes and a last label of
# "200_to_999999999". The output listed after this snippet shows 3-digit
# prefixes and "200_to_999", i.e. it matches max_value=999 - confirm which
# value was actually used.
create_buckets(dftest, lambda v1,v2: v1 <= v2, "value_alpha", 0, 999999999, [1, 2, 5, 10, 25, 50, 100, 200])
display(dftest)
# Add the boolean "value < 10" flag column alongside the bucket label.
create_lt_ten_bucket(dftest, "value_alpha")
display(dftest)
OUTPUT
value_alpha value_alpha_BUCKETED value_alpha_is_lt_ten
0 1 000|0_to_1 True
1 2 001|1_to_2 True
2 3 002|2_to_5 True
3 4 002|2_to_5 True
4 5 002|2_to_5 True
5 44 005|25_to_50 False
6 250 008|200_to_999 False
7 22 004|10_to_25 False
最后,我试图获得类似于以下数据的摘要:
# Desired summary: per-bucket sums of every column, ordered by bucket label.
dftest.groupby('value_alpha_BUCKETED').sum().sort_values('value_alpha_BUCKETED')
value_alpha value_alpha_is_lt_ten
value_alpha_BUCKETED
000|0_to_1 1 1.0
001|1_to_2 2 1.0
002|2_to_5 12 3.0
004|10_to_25 22 0.0
005|25_to_50 44 0.0
008|200_to_999 250 0.0
答案 0 :(得分:1)
我不太确定您具体想问什么,但是您现有的做法大致等价于 `pd.cut` 配合 `pd.DataFrame.groupby`:
# Bin the whole column in one call; the edges are the same bounds used in the
# question (0, the pivots, 999). As the printed result shows, the intervals
# are open on the left and closed on the right, e.g. (0, 1].
dftest['new_bucket'] = pd.cut(dftest['value_alpha'], [0, 1, 2, 5, 10, 25, 50, 100, 200, 999])
# Boolean flag computed on the whole column, with no per-row apply.
dftest['value_alpha_is_lt_ten'] = dftest['value_alpha'] < 10
# Per-bucket sums; buckets containing no rows still appear, with zero totals.
print(dftest.groupby("new_bucket").sum())
value_alpha value_alpha_is_lt_ten
new_bucket
(0, 1] 1 1.0
(1, 2] 2 1.0
(2, 5] 12 3.0
(5, 10] 0 0.0
(10, 25] 22 0.0
(25, 50] 44 0.0
(50, 100] 0 0.0
(100, 200] 0 0.0
(200, 999] 250 0.0
如果您不想要空的分桶,可以在汇总结果上使用 `.query("value_alpha > 0")` 将它们过滤掉。