这是我的 df:
import numpy as np
import pandas as pd
# Sample frame: nine rows in three groups keyed by ``id``.
df = pd.DataFrame(
    {
        'id': [1, 1, 1, 1, 2, 2, 3, 3, 3],
        'col1': [7, 6, 12, 1, 3, 6, 10, 11, 12],
        'col2': [1.2, 0.8, 0.9, 1.1, 2.0, 1.8, 0.7, 0.9, 1.2],
    }
)
我想应用2个函数,每个函数严格返回1个输出。
def myfunc1(g):
    """Divide the group's first ``col1`` value by its first ``col1`` value where ``col2 > 1``.

    Args:
        g: DataFrame for one group, with ``col1`` and ``col2`` columns.

    Returns:
        The ratio, or NaN when no row of the group has ``col2 > 1``.
    """
    var1 = g['col1'].iloc[0]
    matches = g.loc[g['col2'] > 1, 'col1']
    # .iloc[0] on an empty selection raises IndexError, so fall back to NaN.
    var2 = float('nan') if matches.empty else matches.iloc[0]
    return var1 / var2
def myfunc2(g):
    """Subtract the group's first ``col1`` value from its first ``col1`` value where ``col2 < 1``.

    Args:
        g: DataFrame for one group, with ``col1`` and ``col2`` columns.

    Returns:
        The difference, or NaN when no row of the group has ``col2 < 1``.
    """
    var1 = g['col1'].iloc[0]
    matches = g.loc[g['col2'] < 1, 'col1']
    # .iloc[0] on an empty selection raises IndexError, so fall back to NaN.
    var2 = float('nan') if matches.empty else matches.iloc[0]
    return var2 - var1
如果以这种方式运行它们,代码将失败:
# Fails: GroupBy.apply takes a single callable; extra positional arguments are
# forwarded to that callable, so myfunc2 is passed as a second argument to myfunc1.
df[['new_col1','new_col2']] = df.groupby("id").apply(myfunc1,myfunc2)
但是,如果我分别运行它们(见下文),则一切正常:
# Each call yields one value per group, indexed by ``id``.
# NOTE(review): the assignment aligns that result on df's row index rather than
# broadcasting it to every row of the group — confirm this is the intended output.
df['new_col1'] = df.groupby("id").apply(myfunc1)
df['new_col2'] = df.groupby("id").apply(myfunc2)
预期输出应包含以下两列:new_col1 和 new_col2。
答案 0 :(得分:1)
apply 一次只能接收一个函数,因此一种可行的解决方案是再创建一个函数,由它同时调用这两个函数并返回两个结果:
def myfunc1(g):
    """Divide the group's first ``col1`` value by its first ``col1`` value where ``col2 > 1``.

    Args:
        g: DataFrame for one group, with ``col1`` and ``col2`` columns.

    Returns:
        The ratio, or NaN when no row of the group has ``col2 > 1``.
    """
    var1 = g['col1'].iloc[0]
    # return missing value if no match
    var2 = next(iter(g.loc[g['col2'] > 1, 'col1']), np.nan)
    return var1 / var2
def myfunc2(g):
    """Subtract the group's first ``col1`` value from its first ``col1`` value where ``col2 < 1``.

    Args:
        g: DataFrame for one group, with ``col1`` and ``col2`` columns.

    Returns:
        The difference, or NaN when no row of the group has ``col2 < 1``.
    """
    var1 = g['col1'].iloc[0]
    # return missing value if no match
    var2 = next(iter(g.loc[g['col2'] < 1, 'col1']), np.nan)
    return var2 - var1
def f(x):
    """Run both per-group functions on one group and return the results as a labelled Series.

    Args:
        x: DataFrame for one group; passed unchanged to ``myfunc1`` and ``myfunc2``.

    Returns:
        A Series indexed ``new_col1`` / ``new_col2`` holding the two results, so
        that ``groupby(...).apply(f)`` produces one column per function.
    """
    return pd.Series([myfunc1(x), myfunc2(x)], index=['new_col1', 'new_col2'])
# Apply the combined function once per ``id`` group; because f returns a
# labelled Series, the result has one row per group and one column per label.
df1 = df.groupby("id").apply(f)
print (df1)
    new_col1  new_col2
id
1   1.000000      -1.0
2   1.000000       NaN
3   0.833333       0.0
或者只用一个新函数,在其中同时计算并返回两个新列:
def myfunc3(g):
    """Compute both per-group results in one pass and return them as a labelled Series.

    Args:
        g: DataFrame for one group, with ``col1`` and ``col2`` columns.

    Returns:
        A Series with two entries:
        ``new_col1`` — first ``col1`` value divided by the first ``col1`` value
        where ``col2 > 1``;
        ``new_col2`` — first ``col1`` value where ``col2 < 1`` minus the first
        ``col1`` value.
        An entry is NaN when its condition matches no row.
    """
    var1 = g['col1'].iloc[0]
    # next(..., np.nan) yields a missing value instead of failing when nothing matches.
    var2 = next(iter(g.loc[g['col2'] > 1, 'col1']), np.nan)
    var3 = next(iter(g.loc[g['col2'] < 1, 'col1']), np.nan)
    return pd.Series([var1 / var2, var3 - var1], index=['new_col1', 'new_col2'])
# Apply the single combined function once per ``id`` group; its labelled Series
# return value becomes one row per group with one column per label.
df1 = df.groupby("id").apply(myfunc3)
print (df1)
    new_col1  new_col2
id
1   1.000000      -1.0
2   1.000000       NaN
3   0.833333       0.0