# Sample data: a group key (A), two numeric values (B, C) and a year (S),
# one row per observation.
df = pd.DataFrame({
    'A': ['d', 'd', 'd', 'd', 'd', 'd',
          'g', 'g', 'g', 'g', 'g', 'g',
          'k', 'k', 'k', 'k', 'k', 'k'],
    'B': [5, 5, 6, 7, 5, 6, -6, 7, 7, 6, -7, 7, -8, 7, -6, 6, -7, 50],
    'C': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
    'S': [2012, 2013, 2014, 2015, 2016, 2012,
          2012, 2014, 2015, 2016, 2012, 2013,
          2012, 2013, 2014, 2015, 2016, 2014],
})

# For every (A, S) pair compute the sum and the row count of B + C, then move
# the year level into the columns. The result has a two-level column index:
# ('sum' | 'size', year). The chain is parenthesised so it can span lines.
df = (
    (df.B + df.C)
    .groupby([df.A, df.S])
    .agg(['sum', 'size'])
    .unstack(fill_value=0)
)

# Row totals per aggregate ('size' and 'sum') across all years. Grouping the
# transposed frame avoids the deprecated ``groupby(..., axis=1)`` form.
df1 = df.T.groupby(level=0).sum().T
df1.columns = pd.MultiIndex.from_tuples(
    [(agg, 'total') for agg in df1.columns]
)

# Put totals and per-year values side by side, then flatten the two-level
# labels to plain strings: sums keep the bare label ('2012', 'total'),
# counts get a '#' prefix ('#2012', '#total').
df2 = pd.concat([df1, df], axis=1)
label_prefix = {'sum': '', 'size': '#'}
df2.columns = [label_prefix[agg] + str(label) for agg, label in df2.columns]

# Sort only after flattening: every label is then a string, so the order is
# well defined ('#...' columns first, then the years, then 'total'). Sorting
# the MultiIndex directly would have to compare int years with 'total'.
df2 = df2.sort_index(axis=1)

# Per-year sums for 2013..2016 only (label-based slice on the sorted columns).
df_without_2012 = df2.loc[:, '2013':'2016']

# Keep rows where (2012 is negative OR the 2013-2016 sum exceeds 21) AND every
# year from 2013 to 2016 is positive.
# NOTE: the "> 21" test uses the 2013-2016 sum, not the 'total' column, which
# also includes 2012.
keep = (
    ((df2['2012'] < 0) | (df_without_2012.sum(axis=1) > 21))
    & (df_without_2012 > 0).all(axis=1)
)
result = df2[keep]
result  # bare expression so an interactive session still displays the frame
#2012 #2013 #2014 #2015 #2016 #total 2012 2013 2014 2015 2016 total
A
d 2 1 1 1 1 6 13 6 7 8 6 40
g 2 1 1 1 1 6 -11 8 8 8 7 20
两个问题:
1. 对 df2 的筛选结果不应返回 'g':在最后一行('g')中,总和 total 应当大于 21,
而实际只有 20。是哪里不正确?
编辑(期望的筛选结果):
-11 8 8 8 7 20 FALSE
-9 8 8 8 7 22 TRUE
8 8 8 8 7 39 TRUE
4 4 4 4 4 20 FALSE
即:total > 21 &(所有列均为正,或 2013:2016 为正且 2012 为负)
答案 0 :(得分:3)
解释很简单:`total` 列是 2012 到 2016 各列之和。
print (-11 + 8 + 8 + 8 + 7)
20
而 `df_without_2012` 只汇总了 2013 到 2016 的列,因此它的总和为 31:
print (8 + 8 + 8 + 7)
31
和
31 > 21
True
第二个问题的答案是再加上 `.sort_index(axis=1, level=1)`:
# Concatenate the totals (df1) and the per-year values (df) column-wise, sort
# the columns, then sort again on column level 1 only. The chain is wrapped in
# parentheses so the leading-dot continuation lines are valid Python.
df2 = (
    pd.concat([df1, df], axis=1)
    .sort_index(axis=1)
    .sort_index(axis=1, level=1)
)
print(df2)
#2012 2012 #2013 2013 #2014 2014 #2015 2015 #2016 2016 #total \
A
d 2 13 1 6 1 7 1 8 1 6 6
g 2 -11 1 8 1 8 1 8 1 7 6
total
A
d 40
g 20
编辑:
如果有更多的掩码(mask)条件,我认为最好的检验方法是把每个条件单独写出来:
# Build every condition as its own boolean Series so each one can be printed
# and checked on its own before they are combined.
df_without_2012 = df2.sort_index(axis=1).loc[:, '2013':'2016']
m1 = df2['2012'] < 0                      # 2012 is negative
m2 = df_without_2012.sum(axis=1) > 0      # 2013-2016 sum is positive
m3 = (df_without_2012 > 0).all(axis=1)    # every year 2013-2016 is positive
m4 = df2['total'] > 21                    # total over all years exceeds 21
print(m1)
print(m2)
print(m3)
print(m4)
# ``&`` binds tighter than ``|``; the parentheses only make that explicit.
mask = (m1 & m2 & m4) | (m3 & m4)
print(mask)
print(df2[mask])