如何从apply中返回格式正确的pandas数据帧?

时间:2016-08-16 04:42:11

标签: python pandas dataframe multiple-columns nan

假设我们有以下数据框:

def func1(row):
    """Return the first non-missing value of columns 'A' and 'B' for one row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything that supports ``row['A']`` and ``row['B']``.

    Returns
    -------
    The value of ``row['A']`` when it is not missing, otherwise the value of
    ``row['B']`` when that is not missing, otherwise ``float('nan')``.

    Notes
    -----
    NaN never compares equal to anything, itself included, so a test such as
    ``value == float('nan')`` is always False.  Missing values therefore have
    to be detected with ``pd.isna``.
    """
    # Imported locally so this snippet does not rely on a module-level import.
    import pandas as pd

    A = row['A']
    B = row['B']
    if not pd.isna(A):
        return A
    if not pd.isna(B):
        return B
    return float('nan')
# Call func1 once per row (axis=1) and store the returned values in column 'C'.
df['C']=df.apply(func1,axis=1)

按年份和位置分组,然后应用如下函数:

import pandas as pd
import numpy as np

# Candidate values for the two categorical columns.
years = [2005, 2006]
location = ['city', 'suburb']

# Number of rows in the random sample frame.
n_rows = 100

# Random sample data: one scalar draw per row picks the year and the location,
# then two vectorised draws fill the numeric columns.  Upper bounds passed to
# randint are exclusive, so days lie in [100, 600) and cost in [1, 10).
dft = pd.DataFrame(dict(
    year=[years[np.random.randint(0, len(years))] for _ in range(n_rows)],
    location=[location[np.random.randint(0, len(location))] for _ in range(n_rows)],
    days_to_complete=np.random.randint(100, 600, size=n_rows),
    cost_in_millions=np.random.randint(1, 10, size=n_rows),
))

对结果调用unstack(2)我们得到以下输出:

def get_custom_summary(group):
    """Summarise one group by whether a row took more or fewer than 200 days.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``days_to_complete`` and ``cost_in_millions``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gt_200``, ``lt_200`` and ``lt_200_prop``.  The two nested
        dicts use different keys (upper case vs. lower case), so the row index
        is the union of all four keys and each of those columns is NaN on the
        rows belonging to the other one.  ``lt_200_prop`` is a scalar and is
        repeated on every row.

    Rows with exactly 200 days satisfy neither comparison and are ignored.
    """
    days = group.days_to_complete
    over = days > 200
    under = days < 200

    slow = group[over]
    fast = group[under]

    # Share of the counted rows that finished in under 200 days.
    n_under = under.sum()
    share_under = n_under / (over.sum() + n_under)

    summary = {
        'gt_200': {
            'AVG_DAYS': slow.days_to_complete.mean(),
            'AVG_COST': slow.cost_in_millions.mean(),
        },
        'lt_200': {
            'avg_days': fast.days_to_complete.mean(),
            'avg_cost': fast.cost_in_millions.mean(),
        },
        'lt_200_prop': share_under,
    }
    return pd.DataFrame(summary)

# Run get_custom_summary on every (year, location) group of dft.
result = dft.groupby(['year', 'location']).apply(get_custom_summary)

print(result.unstack(2))

                   gt_200                                      lt_200                                 lt_200_prop
                 AVG_COST   AVG_DAYS   avg_cost   avg_days   AVG_COST   AVG_DAYS   avg_cost   avg_days   AVG_COST   AVG_DAYS   avg_cost   avg_days
year location
2005 city        4.818182 415.636364        NaN        NaN        NaN        NaN   7.250000     165.50   0.153846   0.153846   0.153846   0.153846
     suburb      5.631579 336.631579        NaN        NaN        NaN        NaN   5.166667     140.50   0.240000   0.240000   0.240000   0.240000
2006 city        4.130435 396.913043        NaN        NaN        NaN        NaN   5.750000     150.75   0.258065   0.258065   0.258065   0.258065
     suburb      5.294118 392.823529        NaN        NaN        NaN        NaN   1.000000     128.00   0.055556   0.055556   0.055556   0.055556

对于列gt_200和lt_200,调用dropna(axis=1)可以删除填充了NaN的列,但lt_200_prop列仍然带有错误的子列名称。我怎样才能从get_custom_summary返回一个数据框,使它不会把子列(AVG_COST、AVG_DAYS、avg_cost、avg_days)广播(如果这个词用得对的话)到各列(gt_200、lt_200、lt_200_prop)?

编辑:

期望的输出:

lt_200

1 个答案:

答案 0 :(得分:0)

返回一个DataFrame,并将其列设置为MultiIndex。

from collections import OrderedDict

def get_multi_index(ordered_dict):
    """Build a ``pd.MultiIndex`` from a mapping of level name -> level labels.

    Parameters
    ----------
    ordered_dict : mapping
        Keys become the level names, in iteration order.  Each value is a
        sequence of labels for that level; all sequences must have the same
        length (checked with ``assert``).

    Returns
    -------
    pandas.MultiIndex
        Entry *i* is the tuple formed from the *i*-th label of every level.
    """
    level_labels = list(ordered_dict.values())

    # Every level has to supply exactly one label per entry.
    expected = len(level_labels[0])
    for labels in level_labels:
        assert len(labels) == expected

    level_names = list(ordered_dict)
    label_arrays = [np.array(labels) for labels in level_labels]

    # Transpose "one array per level" into "one tuple per entry".
    entries = list(zip(*label_arrays))
    return pd.MultiIndex.from_tuples(entries, names=level_names)

def get_custom_summary(group):
    """Summarise one group as a single row with two-level column labels.

    Parameters
    ----------
    group : pandas.DataFrame
        Must provide the columns ``days_to_complete`` and ``cost_in_millions``.

    Returns
    -------
    pandas.DataFrame
        One row.  The columns are a MultiIndex with levels ``first`` and
        ``second``: (lt_200, avg_cost), (lt_200, avg_days),
        (gt_200, AVG_COST), (gt_200, AVG_DAYS) and (lt_200_prop, prop).

    Rows with exactly 200 days satisfy neither comparison and are ignored.
    """
    gt_200 = group.days_to_complete > 200
    lt_200 = group.days_to_complete < 200

    avg_days_gt_200 = group[gt_200].days_to_complete.mean()
    avg_cost_gt_200 = group[gt_200].cost_in_millions.mean()

    avg_days_lt_200 = group[lt_200].days_to_complete.mean()
    avg_cost_lt_200 = group[lt_200].cost_in_millions.mean()

    # Share of the counted rows that finished in under 200 days.
    lt_200_prop = lt_200.sum() / (gt_200.sum() + lt_200.sum())

    # Label every value explicitly with a (first, second) pair so nothing is
    # broadcast across unrelated sub-columns.
    ordered_dict = OrderedDict()
    ordered_dict['first'] = ['lt_200', 'lt_200', 'gt_200', 'gt_200', 'lt_200_prop']
    ordered_dict['second'] = ['avg_cost', 'avg_days', 'AVG_COST', 'AVG_DAYS', 'prop']

    # A single row; the value order matches the label order above.
    data = [[avg_cost_lt_200, avg_days_lt_200, avg_cost_gt_200, avg_days_gt_200, lt_200_prop]]
    return pd.DataFrame(data, columns=get_multi_index(ordered_dict))

获取并打印结果:

# Summarise every (year, location) group.  get_custom_summary returns a
# one-row frame per group, which adds a third index level holding that row's
# label (0); xs(0, level=2) selects it and drops the level.
result = dft.groupby(['year', 'location']).apply(get_custom_summary).xs(0, level=2)
print(result)

输出:

first            lt_200                gt_200             lt_200_prop
second         avg_cost    avg_days  AVG_COST    AVG_DAYS        prop
year location                                                        
2005 city      7.555556  135.444444  5.300000  363.750000    0.310345
     suburb    5.000000  137.333333  5.555556  444.222222    0.250000
2006 city      6.250000  169.000000  4.714286  422.380952    0.160000
     suburb    4.428571  133.142857  4.333333  445.666667    0.318182