Question

我有一个pandas数据框，就像前四列形成多索引一样：

import pandas as pd
data = [[1, 'A', 1, 0, 10],
        [1, 'A', 0, 1, 10],
        [1, 'A', 1, 1, 10],
        [1, 'A', 0, 0, 10],
        [1, 'B', 1, 0, 10],
        [1, 'B', 0, 1, 10],
        [1, 'B', 1, 1, 10],
        [1, 'B', 0, 0, 10]]
cols = ['user_id','type','flag1','flag2','cnt']
df = pd.DataFrame(data,columns = cols)
df = df.set_index(['user_id','type','flag1','flag2'])
print df

user_id    type    flag1    flag2    cnt
________________________________________
1          A       1        0        10
1          A       0        1        10
1          A       1        1        10
1          A       0        0        10
1          B       1        0        10
1          B       0        1        10
1          B       1        1        10
1          B       0        0        10

我想迭代索引值以获得每个唯一索引值的分组总计数，如下所示：

user_id    type    flag1    flag2    cnt
________________________________________
1          ALL     ALL      ALL      80
1          ALL     ALL      0        40
1          ALL     ALL      1        40
1          ALL     1        ALL      40
1          ALL     0        ALL      40
1          A       ALL      ALL      40
1          B       ALL      ALL      40
1          A       ALL      0        20
1          A       ALL      1        20
1          B       ALL      0        20
1          B       ALL      1        20
1          A       1        ALL      20
1          A       0        ALL      20
1          B       1        ALL      20
1          B       0        ALL      20
1          A       1        0        10
1          A       0        1        10
1          A       1        1        10
1          A       0        0        10
1          B       1        0        10
1          B       0        1        10
1          B       1        1        10
1          B       0        0        10

我能够使用query和groupby轻松生成每个组，但理想情况下，我希望能够迭代任意数量的索引列以获得cnt列的总和。

Answer 1

与之前的答案类似，使用itertools和groupby，这是一种稍微简化的方法：

from itertools import chain, combinations
indices = ['user_id','type','flag1','flag2']
powerset = list(chain.from_iterable(combinations(indices, r) for r in range(1,len(indices)+1)))

master = (pd.concat([df.reset_index().groupby(p, as_index=False).sum() 
                     for p in powerset if p[0] == "user_id"])[cols]
            .replace([None,4,2], "ALL")
            .sort_values("cnt", ascending=False))

输出：

user_id type flag1 flag2  cnt
0        1  ALL   ALL   ALL   80
0        1    A   ALL   ALL   40
1        1    B   ALL   ALL   40
0        1  ALL     0   ALL   40
1        1  ALL     1   ALL   40
0        1  ALL   ALL     0   40
1        1  ALL   ALL     1   40
3        1  ALL     1     1   20
2        1  ALL     1     0   20
1        1  ALL     0     1   20
0        1  ALL     0     0   20
3        1    B     1     1   20
2        1    B     1     0   20
1        1    A     1     1   20
0        1    A     1     0   20
3        1    B     1     1   20
2        1    B     0     1   20
1        1    A     1     1   20
0        1    A     0     1   20
0        1    A     0     0   10
1        1    A     0     1   10
2        1    A     1     0   10
3        1    A     1     1   10
4        1    B     0     0   10
5        1    B     0     1   10
6        1    B     1     0   10
7        1    B     1     1   10

powerset计算直接来自itertools文档。

Answer 2

我主要使用itertools中的combinations和product combinations适用于每列中的所有值组合 product适用于所有列的所有值组合。

import pandas as pd
from itertools import combinations, product
import numpy as np


def iterativeSum(df, cols, target_col):
    # All possible combinations within each column
    comb_each_col = []
    for col in cols:
        # Take 1 to n element in the unique set of values in each column
        each_col = [list(combinations(set(df[col]), i))
                    for i in range(1, len(set(df[col]))+1)]
        # Flat the list
        each_col = [list(x) for sublist in each_col for x in sublist]
        # Record the combination
        comb_each_col.append(each_col)
    # All possible combinations across all columns
    comb_all_col = list(product(*comb_each_col))
    result = pd.DataFrame()
    # Iterate over all combinations
    for value in comb_all_col:
        # Get condition which match the value in each column
        condition = np.array(
            [df[col].isin(v).values for col, v in zip(cols, value)]).all(axis=0)
        # Get the sum of rows which meet the condition
        condition_sum = df.loc[condition][target_col].sum()
        # Format values for output
        value2 = []
        for x in value:
            try:
                # String can be joined together directly
                value2.append(','.join(x))
            except:
                # Numbers can be joined after converted to string
                x = [str(y) for y in x]
                value2.append(','.join(x))
        # Put result into table
        result = pd.concat([result, pd.DataFrame([value2+[condition_sum]])])
    result.columns = cols + [target_col]
    return(result)

data = [[1, 'A', 1, 0, 10],
        [1, 'A', 0, 1, 10],
        [1, 'A', 1, 1, 10],
        [1, 'A', 0, 0, 10],
        [1, 'B', 1, 0, 10],
        [1, 'B', 0, 1, 10],
        [1, 'B', 1, 1, 10],
        [1, 'B', 0, 0, 10]]
cols = ['user_id', 'type', 'flag1', 'flag2', 'cnt']
df = pd.DataFrame(data, columns=cols)
# Columns for grouping
grouped_cols = ['type', 'flag1', 'flag2']
# Columns for summing
target_col = 'cnt'
print iterativeSum(df, grouped_cols, target_col)

结果：

  type flag1 flag2  cnt
0    A     0     0   10
0    A     0     1   10
0    A     0   0,1   20
0    A     1     0   10
0    A     1     1   10
0    A     1   0,1   20
0    A   0,1     0   20
0    A   0,1     1   20
0    A   0,1   0,1   40
0    B     0     0   10
0    B     0     1   10
0    B     0   0,1   20
0    B     1     0   10
0    B     1     1   10
0    B     1   0,1   20
0    B   0,1     0   20
0    B   0,1     1   20
0    B   0,1   0,1   40
0  A,B     0     0   20
0  A,B     0     1   20
0  A,B     0   0,1   40
0  A,B     1     0   20
0  A,B     1     1   20
0  A,B     1   0,1   40
0  A,B   0,1     0   40
0  A,B   0,1     1   40
0  A,B   0,1   0,1   80

Answer 3

#build all groupby key combinations
import itertools
keys = ['user_id', 'type', 'flag1', 'flag2']
key_combos = [c for i in range(len(keys)) for c in itertools.combinations(keys, i+1)]
#make sure only select the combos with 'user_id' in it
key_combos = [list(e) for e in key_combos if 'user_id' in e]
#groupby using all groupby keys and concatenate the results to a Dataframe
df2 = pd.concat([df.groupby(by=key).cnt.sum().to_frame().reset_index() for key in sorted(key_combos)])
#Fill na with ALL and re-order columns
df2.fillna('ALL')[['user_id','type','flag1','flag2','cnt']]

Out[521]: 
   user_id type flag1 flag2  cnt
0        1  ALL   ALL   ALL   80
0        1  ALL     0   ALL   40
1        1  ALL     1   ALL   40
0        1  ALL     0     0   20
1        1  ALL     0     1   20
2        1  ALL     1     0   20
3        1  ALL     1     1   20
0        1  ALL   ALL     0   40
1        1  ALL   ALL     1   40
0        1    A   ALL   ALL   40
1        1    B   ALL   ALL   40
0        1    A     0   ALL   20
1        1    A     1   ALL   20
2        1    B     0   ALL   20
3        1    B     1   ALL   20
0        1    A     0     0   10
1        1    A     0     1   10
2        1    A     1     0   10
3        1    A     1     1   10
4        1    B     0     0   10
5        1    B     0     1   10
6        1    B     1     0   10
7        1    B     1     1   10
0        1    A   ALL     0   20
1        1    A   ALL     1   20
2        1    B   ALL     0   20
3        1    B   ALL     1   20

在MultiIndex级别上迭代Pandas并在groupby中迭代以获得总数

3 个答案: