我有一个pandas数据框,就像前四列形成多索引一样:
import pandas as pd
data = [[1, 'A', 1, 0, 10],
[1, 'A', 0, 1, 10],
[1, 'A', 1, 1, 10],
[1, 'A', 0, 0, 10],
[1, 'B', 1, 0, 10],
[1, 'B', 0, 1, 10],
[1, 'B', 1, 1, 10],
[1, 'B', 0, 0, 10]]
cols = ['user_id','type','flag1','flag2','cnt']
df = pd.DataFrame(data,columns = cols)
df = df.set_index(['user_id','type','flag1','flag2'])
print df
user_id type flag1 flag2 cnt
________________________________________
1 A 1 0 10
1 A 0 1 10
1 A 1 1 10
1 A 0 0 10
1 B 1 0 10
1 B 0 1 10
1 B 1 1 10
1 B 0 0 10
我想迭代索引值以获得每个唯一索引值的分组总计数,如下所示:
user_id type flag1 flag2 cnt
________________________________________
1 ALL ALL ALL 80
1 ALL ALL 0 40
1 ALL ALL 1 40
1 ALL 1 ALL 40
1 ALL 0 ALL 40
1 A ALL ALL 40
1 B ALL ALL 40
1 A ALL 0 20
1 A ALL 1 20
1 B ALL 0 20
1 B ALL 1 20
1 A 1 ALL 20
1 A 0 ALL 20
1 B 1 ALL 20
1 B 0 ALL 20
1 A 1 0 10
1 A 0 1 10
1 A 1 1 10
1 A 0 0 10
1 B 1 0 10
1 B 0 1 10
1 B 1 1 10
1 B 0 0 10
我能够使用query和groupby轻松生成每个组,但理想情况下,我希望能够迭代任意数量的索引列以获得cnt列的总和。
答案 0 :(得分:1)
与之前的答案类似,使用itertools
和groupby
,这是一种稍微简化的方法:
from itertools import chain, combinations
indices = ['user_id','type','flag1','flag2']
powerset = list(chain.from_iterable(combinations(indices, r) for r in range(1,len(indices)+1)))
master = (pd.concat([df.reset_index().groupby(p, as_index=False).sum()
for p in powerset if p[0] == "user_id"])[cols]
.replace([None,4,2], "ALL")
.sort_values("cnt", ascending=False))
输出:
user_id type flag1 flag2 cnt
0 1 ALL ALL ALL 80
0 1 A ALL ALL 40
1 1 B ALL ALL 40
0 1 ALL 0 ALL 40
1 1 ALL 1 ALL 40
0 1 ALL ALL 0 40
1 1 ALL ALL 1 40
3 1 ALL 1 1 20
2 1 ALL 1 0 20
1 1 ALL 0 1 20
0 1 ALL 0 0 20
3 1 B 1 1 20
2 1 B 1 0 20
1 1 A 1 1 20
0 1 A 1 0 20
3 1 B 1 1 20
2 1 B 0 1 20
1 1 A 1 1 20
0 1 A 0 1 20
0 1 A 0 0 10
1 1 A 0 1 10
2 1 A 1 0 10
3 1 A 1 1 10
4 1 B 0 0 10
5 1 B 0 1 10
6 1 B 1 0 10
7 1 B 1 1 10
powerset
计算直接来自itertools
文档。
答案 1 :(得分:0)
我主要使用itertools中的combinations
和product
combinations
适用于每列中的所有值组合
product
适用于所有列的所有值组合。
import pandas as pd
from itertools import combinations, product
import numpy as np
def iterativeSum(df, cols, target_col):
# All possible combinations within each column
comb_each_col = []
for col in cols:
# Take 1 to n element in the unique set of values in each column
each_col = [list(combinations(set(df[col]), i))
for i in range(1, len(set(df[col]))+1)]
# Flat the list
each_col = [list(x) for sublist in each_col for x in sublist]
# Record the combination
comb_each_col.append(each_col)
# All possible combinations across all columns
comb_all_col = list(product(*comb_each_col))
result = pd.DataFrame()
# Iterate over all combinations
for value in comb_all_col:
# Get condition which match the value in each column
condition = np.array(
[df[col].isin(v).values for col, v in zip(cols, value)]).all(axis=0)
# Get the sum of rows which meet the condition
condition_sum = df.loc[condition][target_col].sum()
# Format values for output
value2 = []
for x in value:
try:
# String can be joined together directly
value2.append(','.join(x))
except:
# Numbers can be joined after converted to string
x = [str(y) for y in x]
value2.append(','.join(x))
# Put result into table
result = pd.concat([result, pd.DataFrame([value2+[condition_sum]])])
result.columns = cols + [target_col]
return(result)
data = [[1, 'A', 1, 0, 10],
[1, 'A', 0, 1, 10],
[1, 'A', 1, 1, 10],
[1, 'A', 0, 0, 10],
[1, 'B', 1, 0, 10],
[1, 'B', 0, 1, 10],
[1, 'B', 1, 1, 10],
[1, 'B', 0, 0, 10]]
cols = ['user_id', 'type', 'flag1', 'flag2', 'cnt']
df = pd.DataFrame(data, columns=cols)
# Columns for grouping
grouped_cols = ['type', 'flag1', 'flag2']
# Columns for summing
target_col = 'cnt'
print iterativeSum(df, grouped_cols, target_col)
结果:
type flag1 flag2 cnt
0 A 0 0 10
0 A 0 1 10
0 A 0 0,1 20
0 A 1 0 10
0 A 1 1 10
0 A 1 0,1 20
0 A 0,1 0 20
0 A 0,1 1 20
0 A 0,1 0,1 40
0 B 0 0 10
0 B 0 1 10
0 B 0 0,1 20
0 B 1 0 10
0 B 1 1 10
0 B 1 0,1 20
0 B 0,1 0 20
0 B 0,1 1 20
0 B 0,1 0,1 40
0 A,B 0 0 20
0 A,B 0 1 20
0 A,B 0 0,1 40
0 A,B 1 0 20
0 A,B 1 1 20
0 A,B 1 0,1 40
0 A,B 0,1 0 40
0 A,B 0,1 1 40
0 A,B 0,1 0,1 80
答案 2 :(得分:0)
#build all groupby key combinations
import itertools
keys = ['user_id', 'type', 'flag1', 'flag2']
key_combos = [c for i in range(len(keys)) for c in itertools.combinations(keys, i+1)]
#make sure only select the combos with 'user_id' in it
key_combos = [list(e) for e in key_combos if 'user_id' in e]
#groupby using all groupby keys and concatenate the results to a Dataframe
df2 = pd.concat([df.groupby(by=key).cnt.sum().to_frame().reset_index() for key in sorted(key_combos)])
#Fill na with ALL and re-order columns
df2.fillna('ALL')[['user_id','type','flag1','flag2','cnt']]
Out[521]:
user_id type flag1 flag2 cnt
0 1 ALL ALL ALL 80
0 1 ALL 0 ALL 40
1 1 ALL 1 ALL 40
0 1 ALL 0 0 20
1 1 ALL 0 1 20
2 1 ALL 1 0 20
3 1 ALL 1 1 20
0 1 ALL ALL 0 40
1 1 ALL ALL 1 40
0 1 A ALL ALL 40
1 1 B ALL ALL 40
0 1 A 0 ALL 20
1 1 A 1 ALL 20
2 1 B 0 ALL 20
3 1 B 1 ALL 20
0 1 A 0 0 10
1 1 A 0 1 10
2 1 A 1 0 10
3 1 A 1 1 10
4 1 B 0 0 10
5 1 B 0 1 10
6 1 B 1 0 10
7 1 B 1 1 10
0 1 A ALL 0 20
1 1 A ALL 1 20
2 1 B ALL 0 20
3 1 B ALL 1 20