Question

我通过以下方式创建数据框：

filtered_data.groupby('weekday').agg({'airing': np.sum, 'uplift': [np.sum,np.mean]})

创建一个表格：

    sum     sum     mean
weekday             
1   11  20  1.818182
2   24  46  1.916667
...

我想要的是包括最后一行，即每列的总和。

提前致谢！

Answer 1

您可以使用.loc功能来实现：

df.loc[len(df)] = [df[col].sum() for col in df.columns]

Answer 2

在这种情况下，您应该创建一个跟踪摘要统计信息的系列。如果你需要用于显示目的，你可以连接。

summary = pd.Series([filtered_data.airing.sum(), 
                     filtered_data.uplift.sum(), 
                     filtered_data.uplift.mean()], 
                    name='summary')

Answer 3

为此，我创建了一个聚合工具，其行为类似于SQL中的GROUPING SETS。提供用于分组和聚合函数的列，并返回聚合的DataFrame。

import itertools as it
import pandas as pd

from pandas.util.testing import assert_frame_equal

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return it.chain.from_iterable(it.combinations(s,r) for r in range(len(s)+1))

def grouper(df,grpby,aggfunc):
    ''' produces aggregate DataFrame from DataFrames for non-redundant groupings
       `workingdf` is used to avoid modifying original DataFrame
    '''
    uniqcols = set(col for col in grpby if len(df[col].unique()) == 1)
    subset = set()
    for col in uniqcols:
        for grp in powerset(grpby):
            if col in grp:
                subset.add(grp) # add level of aggregation only when non-redundant

    if len(subset) == 0: 
        for grp in powerset(grpby): 
            subset.add(grp)

    workingdf = df.copy()
    for idx,i in enumerate(subset):
        if i != (): 
            tmp = aggfunc( workingdf.groupby(i) )
        else:
            # hack to get output to be a DataFrameGroupBy object:
            #   insert dummy column on which to group by
            dummycolname = hash(tuple(workingdf.columns.tolist()))
            workingdf[dummycolname] = ''
            tmp = aggfunc( workingdf.groupby(dummycolname) )

        # drop the index and add it back
        if i == (): tmp.reset_index(drop=True,inplace=True)
        else: tmp.reset_index(inplace=True)

        for j in grpby: 
            if j not in tmp: # if column is not in DataFrame add it
                tmp[j] = '(All)'

        # new list with all columns including aggregate ones; do this only once
        if idx == 0:
            finalcols = grpby[:]
            addlcols = [k for k in tmp if k not in grpby] # aggregate columns 
            finalcols.extend(addlcols)

        # reorder columns 
        tmp = tmp[finalcols] 

        if idx == 0:
            final = tmp; del tmp
        else: 
            final = pd.concat( [final,tmp] ); del tmp

    del workingdf

    final.sort_values(finalcols,inplace=True)
    final.reset_index(drop=True,inplace=True)

    return final

def agg(grpbyobj):
    ''' the purpose of this function is to: 
          specify aggregate operation(s) you wish to perform, 
          name the resulting column(s) in the final DataFrame.
    '''
    tmp = pd.DataFrame()
    tmp['Total (n)'] = grpbyobj['Total'].sum()
    return tmp

if __name__ == '__main__':
    df = pd.DataFrame({'Area':['a','a','b',],
                       'Year':[2014,2014,2014,],
                       'Month':[1,2,3,],
                       'Total':[4,5,6,],})
    final = grouper(df,grpby=['Area','Year'],aggfunc=agg)

    # test against expected result 
    expected = pd.DataFrame({u'Year': {0: 2014, 1: 2014, 2: 2014}, 
                             u'Total (n)': {0: 15, 1: 9, 2: 6}, 
                             u'Area': {0: u'(All)', 1: u'a', 2: u'b'}})
    expected = expected[final.columns.tolist()] 
    try:
        # check_names kwarg True: compare indexes and columns 
        assert_frame_equal(final,expected,check_names=True)
    except AssertionError as e:
        raise

在熊猫数据框中创建总行数

3 个答案: