假设我们有以下数据框:
def func1(row):
    """Return the first non-missing value of columns 'A' and 'B' for one row.

    Args:
        row: A mapping-like row (e.g. the Series passed by
            ``DataFrame.apply(..., axis=1)``) with keys ``'A'`` and ``'B'``.

    Returns:
        ``row['A']`` when it is not missing, otherwise ``row['B']`` when that
        is not missing, otherwise ``float('nan')``.
    """
    # Imported here so the function works regardless of where the file's own
    # pandas import is placed relative to this definition.
    import pandas as pd

    a = row['A']
    b = row['B']
    # NaN never compares equal to anything (including itself), so an
    # ``== float('nan')`` test is always False; use pd.isna instead.
    if not pd.isna(a):
        return a
    if not pd.isna(b):
        return b
    return float('nan')
# Call func1 once per row (axis=1) and store the results in a new column 'C'.
df['C']=df.apply(func1,axis=1)
分组年份和位置,然后应用如下函数:
import pandas as pd
import numpy as np
# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']

# 100 rows of random sample data. Columns are drawn in the order year,
# location, days, cost, so a seeded generator produces the same frame.
dft = pd.DataFrame(
    {
        'year': [years[np.random.randint(0, len(years))] for _ in range(100)],
        'location': [
            location[np.random.randint(0, len(location))] for _ in range(100)
        ],
        # Upper bounds are exclusive: days fall in 100..599, cost in 1..9.
        'days_to_complete': np.random.randint(100, 600, size=100),
        'cost_in_millions': np.random.randint(1, 10, size=100),
    }
)
对结果调用unstack(2)我们得到以下输出:
def get_custom_summary(group):
    """Summarise one group by whether projects took more or less than 200 days.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        A DataFrame with columns ``gt_200``, ``lt_200`` and ``lt_200_prop``.
        The row labels are the union of the inner dict keys, so ``gt_200`` is
        NaN on the lower-case rows, ``lt_200`` is NaN on the upper-case rows,
        and the scalar ``lt_200_prop`` is repeated on every row.
    """
    days = group.days_to_complete
    cost = group.cost_in_millions

    # Strict comparisons: rows with exactly 200 days fall in neither bucket.
    over = days > 200
    under = days < 200

    over_stats = {'AVG_DAYS': days[over].mean(), 'AVG_COST': cost[over].mean()}
    under_stats = {'avg_days': days[under].mean(), 'avg_cost': cost[under].mean()}

    # Share of the bucketed rows that finished in under 200 days.
    share_under = under.sum() / (over.sum() + under.sum())

    return pd.DataFrame({
        'gt_200': over_stats,
        'lt_200': under_stats,
        'lt_200_prop': share_under,
    })
# Run the summary once per (year, location) group.
result = dft.groupby(['year', 'location']).apply(get_custom_summary)
# Move index level 2 (the innermost level of the applied result) into the
# columns before printing.
print(result.unstack(2))
                   gt_200                                      lt_200                                 lt_200_prop
                 AVG_COST   AVG_DAYS   avg_cost   avg_days   AVG_COST   AVG_DAYS   avg_cost   avg_days   AVG_COST   AVG_DAYS   avg_cost   avg_days
year location
2005 city        4.818182 415.636364        NaN        NaN        NaN        NaN   7.250000     165.50   0.153846   0.153846   0.153846   0.153846
     suburb      5.631579 336.631579        NaN        NaN        NaN        NaN   5.166667     140.50   0.240000   0.240000   0.240000   0.240000
2006 city        4.130435 396.913043        NaN        NaN        NaN        NaN   5.750000     150.75   0.258065   0.258065   0.258065   0.258065
     suburb      5.294118 392.823529        NaN        NaN        NaN        NaN   1.000000     128.00   0.055556   0.055556   0.055556   0.055556
对于列 `gt_200` 和 `lt_200`,调用 `dropna(axis=1)` 可以删除全为 NaN 的列,但 `lt_200_prop` 列下仍然带有错误的子列名称。我怎样才能让 get_custom_summary 返回一个数据框,使子列(`AVG_COST`、`AVG_DAYS`、`avg_cost`、`avg_days`)不会被广播(如果这个词用得对的话)到所有列(`gt_200`、`lt_200`、`lt_200_prop`)上?
编辑:
期望的输出:
lt_200
答案 0 :(得分:0)
返回一个 DataFrame,并将其列设置为 MultiIndex。
from collections import OrderedDict
def get_multi_index(ordered_dict):
    """Build a column MultiIndex from a mapping of level name -> labels.

    Args:
        ordered_dict: Mapping whose keys become the level names (in iteration
            order) and whose values are equal-length label sequences, one
            entry per resulting column.

    Returns:
        A ``pd.MultiIndex`` whose i-th entry combines the i-th label of every
        level.

    Raises:
        AssertionError: If the label sequences differ in length.
    """
    expected_len = len(list(ordered_dict.values())[0])
    for level_name in ordered_dict:
        assert(len(ordered_dict[level_name]) == expected_len)

    level_names = [level_name for level_name in ordered_dict]
    level_arrays = [np.array(ordered_dict[level_name]) for level_name in ordered_dict]

    # Transpose per-level arrays into one tuple per column.
    column_tuples = list(zip(*level_arrays))
    return pd.MultiIndex.from_tuples(column_tuples, names=level_names)
def get_custom_summary(group):
    """Summarise one group as a single row with two-level column labels.

    Args:
        group: DataFrame with ``days_to_complete`` and ``cost_in_millions``
            columns.

    Returns:
        A one-row DataFrame whose columns are the MultiIndex
        ``(lt_200, avg_cost)``, ``(lt_200, avg_days)``, ``(gt_200, AVG_COST)``,
        ``(gt_200, AVG_DAYS)``, ``(lt_200_prop, prop)``.
    """
    days = group.days_to_complete
    cost = group.cost_in_millions

    # Strict comparisons: rows with exactly 200 days fall in neither bucket.
    over = days > 200
    under = days < 200

    mean_days_over = days[over].mean()
    mean_cost_over = cost[over].mean()
    mean_days_under = days[under].mean()
    mean_cost_under = cost[under].mean()
    share_under = under.sum() / (over.sum() + under.sum())

    # Level labels and the data row below are listed in the same column order.
    levels = OrderedDict([
        ('first', ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']),
        ('second', ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']),
    ])
    row = [mean_cost_under, mean_days_under, mean_cost_over, mean_days_over, share_under]
    return pd.DataFrame([row], columns=get_multi_index(levels))
获取并打印结果:
grouped = dft.groupby(['year', 'location'])
# Every per-group summary is a single row labelled 0, so selecting label 0 on
# index level 2 keeps all rows and drops that level.
result = grouped.apply(get_custom_summary).xs(0, level=2)
print(result)
输出:
first               lt_200                  gt_200             lt_200_prop
second            avg_cost    avg_days    AVG_COST    AVG_DAYS        prop
year location
2005 city         7.555556  135.444444    5.300000  363.750000    0.310345
     suburb       5.000000  137.333333    5.555556  444.222222    0.250000
2006 city         6.250000  169.000000    4.714286  422.380952    0.160000
     suburb       4.428571  133.142857    4.333333  445.666667    0.318182