我有以下DataFrame
# Toy data: two models (A0, A1) with two observations per model per week, weeks 1-3.
df = pd.DataFrame(
    {
        'model': ['A0', 'A0', 'A1', 'A1'] * 3,
        'y_true': [1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        'y_pred': [0, 1, 5, 5, 7, 8, 8, 12, 8, 7, 14, 15],
        'week': [1] * 4 + [2] * 4 + [3] * 4,
    }
)
model y_true y_pred week
0 A0 1 0 1
1 A0 2 1 1
2 A1 3 5 1
3 A1 3 5 1
4 A0 4 7 2
5 A0 5 8 2
6 A1 6 8 2
7 A1 7 12 2
8 A0 8 8 3
9 A0 9 7 3
10 A1 10 14 3
11 A1 11 15 3
我想用sklearn计算一些评估指标,所以写了下面这个函数:
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
import numpy as np
def metrics(df):
    """Score the predictions in *df* against the true values.

    Reads the 'y_true' and 'y_pred' columns of *df* and returns the tuple
    (mean absolute error, mean squared error, explained variance score).
    """
    actual = np.asarray(df['y_true'])
    predicted = np.asarray(df['y_pred'])
    scorers = (mean_absolute_error, mean_squared_error, explained_variance_score)
    # Same order as the returned tuple: mae, mse, evs.
    return tuple(score(actual, predicted) for score in scorers)
我试图以此方式分组
# Score each (model, week) group on its own rows only; nothing is accumulated across weeks.
df.groupby(['model', 'week']).apply(metrics)
它返回的是每周各自的指标,但我希望指标是从第1周开始逐周累积计算的。也就是说:
1. For the results of week 1 I want the metrics of y_true and y_pred where the column week takes the value 1.
2. For the results of week 2 I want the metrics of y_true and y_pred where the column week takes the values 1 or 2
3. For the results of week 3 I want the metrics of y_true and y_pred where the column week takes the values 1, 2 or 3
这是部分解决方案,但这不是我想要的。
y_true y_pred y_true_cum \
model week
A0 1 [1, 2] [0, 1] [1, 2]
2 [4, 5] [7, 8] [1, 2, 4, 5]
3 [8, 9] [8, 7] [1, 2, 4, 5, 8, 9]
A1 1 [3, 3] [5, 5] [1, 2, 4, 5, 8, 9, 3, 3]
2 [6, 7] [8, 12] [1, 2, 4, 5, 8, 9, 3, 3, 6, 7]
3 [10, 11] [14, 15] [1, 2, 4, 5, 8, 9, 3, 3, 6, 7, 10, 11]
我希望每个模型各自按周累积,而不是跨模型累积:
y_true y_pred y_true_cum \
model week
A0 1 [1, 2] [0, 1] [1, 2]
2 [4, 5] [7, 8] [1, 2, 4, 5]
3 [8, 9] [8, 7] [1, 2, 4, 5, 8, 9]
A1 1 [3, 3] [5, 5] [3, 3]
2 [6, 7] [8, 12] [ 3, 3, 6, 7]
3 [10, 11] [14, 15] [ 3, 3, 6, 7, 10, 11]
答案 0 :(得分:1)
这应该做到:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
# Toy data: two models (A0, A1) with two observations per model per week, weeks 1-3.
df = pd.DataFrame(dict(
    model=['A0', 'A0', 'A1', 'A1'] * 3,
    week=[1] * 4 + [2] * 4 + [3] * 4,
    y_true=[1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11],
    y_pred=[0, 1, 5, 5, 7, 8, 8, 12, 8, 7, 14, 15],
))
def metrics(df):
    """Return a copy of *df* extended with 'mae', 'mse' and 'evs' entries.

    *df* is expected to expose ``y_true`` and ``y_pred`` (here: one row of the
    grouped frame, handed over by ``DataFrame.apply(..., axis=1)``, whose
    fields hold lists of values). The three scores are computed from those
    two fields.

    The input is copied before the scores are added: pandas does not support
    functions passed to ``apply`` that mutate the object they receive.
    """
    scored = df.copy()
    scored['mae'] = mean_absolute_error(df.y_true, df.y_pred)
    scored['mse'] = mean_squared_error(df.y_true, df.y_pred)
    scored['evs'] = explained_variance_score(df.y_true, df.y_pred)
    return scored
# Group by model and week, keeping all y_true/y_pred values of each group as lists.
df_group = df.groupby(['model', 'week']).agg(list)
# Accumulate the lists week by week within each model (cumsum on lists concatenates).
# The columns are selected with a list (double brackets): selecting with a bare
# tuple of keys is rejected by current pandas. group_keys=False keeps the
# original (model, week) index instead of prepending the group key again.
df_group = (
    df_group.groupby('model', group_keys=False)[['y_true', 'y_pred']]
    .apply(lambda x: x.cumsum())
)
# Score every row: y_true/y_pred now hold the accumulated lists.
df_group.apply(metrics, axis=1)
答案 1 :(得分:0)
作为对RubenB答案的补充:对他的代码稍作修改即可满足要求。
在这一步之后:
# Collect the y_true/y_pred values of every (model, week) group into lists.
df_group = df.groupby(['model', 'week']).agg(lambda x: list(x))
我们可以对每个模型对应的那部分数据分别使用cumsum:
# Placeholder columns, filled model by model below.
for col in ['y_true', 'y_pred']:
    df_group[f'{col}_cum'] = None
# Index by model alone so that .loc[model] selects all weekly rows of that model.
df_group = df_group.reset_index().set_index('model')
for col in ['y_true', 'y_pred']:
    # The index repeats each model once per week; unique() visits every model
    # exactly once instead of recomputing the same result for each of its rows.
    for model in df_group.index.unique():
        # cumsum over a column of lists concatenates them week by week.
        df_group.loc[model, f'{col}_cum'] = df_group.loc[model, col].cumsum()
最后,就像RubenB所做的那样:
# metrics() reads the 'y_true' and 'y_pred' fields of each row, so hand it the
# cumulative lists under those names; applied to df_group as-is it would score
# the per-week lists and ignore the *_cum columns.
(
    df_group.drop(columns=['y_true', 'y_pred'])
    .rename(columns={'y_true_cum': 'y_true', 'y_pred_cum': 'y_pred'})
    .apply(metrics, axis=1)
)
下面是不使用额外循环的一种尝试——不过,这样会得到一个比较杂乱的lambda函数。
# One row per (model, week), each holding that group's values as a list.
df_group = df.groupby(['model', 'week']).agg(lambda x: list(x)).reset_index()
for col in ['y_true', 'y_pred']:
    # For every row, select the rows of the same model up to and including
    # that row's week and sum the selected lists into one.
    df_group[f'{col}_cum'] = df_group.apply(
        lambda row: df_group.loc[
            (df_group.model == row.model) & (df_group.week <= row.week), col
        ].sum(),
        axis=1,
    )
最后:
# metrics() reads the 'y_true' and 'y_pred' fields of each row, so hand it the
# cumulative lists under those names; applied to df_group as-is it would score
# the per-week lists and ignore the *_cum columns.
(
    df_group.set_index(['model', 'week'])
    .drop(columns=['y_true', 'y_pred'])
    .rename(columns={'y_true_cum': 'y_true', 'y_pred_cum': 'y_pred'})
    .apply(metrics, axis=1)
)