我正在尝试使用部分列名称的for循环构造数据框。我试过这个如下。
N = 10000
J = [2012,2013,2014]
K = ['A','B','C','D','E','F','G','H']
L = ['h', 'd', 'a']
S = ['AR1','PO1','RU1']
np.random.seed(0)
df = pd.DataFrame(
{'Y':np.random.uniform(1,10,N),
'X':np.random.uniform(1,10,N),
'Z':np.random.uniform(1,10,N),
'J':np.random.choice(J,N),
'S':np.random.choice(S,N),
'R':np.random.choice(L,N)
})
for a in ["X", "Y", "Z"]:
for num in list(range(0, 10)):
df['bins_{}'.format(a)] = df.groupby('S')["{}".format(a)].apply(pd.qcut, q=10, labels=np.arange(10))
df['bin_{}_{}'.format(a, num)] = np.where((df['bins_{}'.format(a)] == num) & (df['R'] == 'a'),
(df['{}'.format(a)] * 2) - 2,
np.where((df['bins_{}'.format(a)] == num) & (df['R'] != 'a'), -2, 0))
df_Z0 = (df.bin_Z_0).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack(fill_value=0)
df_Z1 = (df.bin_Z_1).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack(fill_value=0)
df_Z2 = (df.bin_Z_2).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack(fill_value=0)
# for a in [str.startswith('bin_Z')]:
# df_Z = (df.a).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack(fill_value=0)
df_Z0
但是我希望它们从0到9,然后是其他列(X,Y等)。像这样:
有没有人知道如何解决这个问题?
答案 0 :(得分:2)
我认为你需要sort_index
第二级S
:
print (df_Z0.sort_index(level='S'))
编辑:
解决方案将0
替换为NaN
,然后combine_first
- 将NaN
替换为非NaN数据:
df_Z0 = (df.bin_Z_0).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack()
.replace({0:np.nan})
df_Z1 = (df.bin_Z_1).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack()
.replace({0:np.nan})
df_Z2 = (df.bin_Z_2).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size']).unstack()
.replace({0:np.nan})
print (df_Z0.combine_first(df_Z1).combine_first(df_Z2).fillna(0).sort_index(level='S'))
sum size
J 2012 2013 2014 2012 2013 2014
bins_Z S
0 AR1 -130.627936 -109.113076 -129.183069 114 102 115
1 AR1 -53.332480 -21.304159 -67.457064 111 102 118
2 AR1 -50.862985 34.782290 74.742122 106 111 113
3 AR1 0.000000 0.000000 0.000000 96 107 128
4 AR1 0.000000 0.000000 0.000000 109 103 118
5 AR1 0.000000 0.000000 0.000000 109 109 113
6 AR1 0.000000 0.000000 0.000000 113 88 129
7 AR1 0.000000 0.000000 0.000000 108 106 117
8 AR1 0.000000 0.000000 0.000000 121 107 102
9 AR1 0.000000 0.000000 0.000000 112 105 114
0 PO1 -102.842426 -104.375470 -143.081966 109 113 112
1 PO1 -24.968751 -28.222851 -57.769469 119 107 107
2 PO1 9.301271 -64.096715 -46.586959 108 116 109
3 PO1 0.000000 0.000000 0.000000 120 88 125
4 PO1 0.000000 0.000000 0.000000 108 110 115
5 PO1 0.000000 0.000000 0.000000 100 110 123
6 PO1 0.000000 0.000000 0.000000 132 109 92
7 PO1 0.000000 0.000000 0.000000 120 108 105
8 PO1 0.000000 0.000000 0.000000 109 114 110
9 PO1 0.000000 0.000000 0.000000 107 111 116
0 RU1 -110.563320 -124.948033 -121.455465 116 117 104
1 RU1 -17.513545 -48.205310 -32.040780 112 120 104
2 RU1 34.402681 3.013596 -23.436698 106 115 115
3 RU1 0.000000 0.000000 0.000000 134 94 108
4 RU1 0.000000 0.000000 0.000000 99 115 122
5 RU1 0.000000 0.000000 0.000000 134 113 89
6 RU1 0.000000 0.000000 0.000000 91 128 117
7 RU1 0.000000 0.000000 0.000000 105 115 116
8 RU1 0.000000 0.000000 0.000000 113 121 102
9 RU1 0.000000 0.000000 0.000000 101 125 111
如果需要使用0
替换来自其他数据框的非0
数据,并且0
列中没有非sum
数据重叠,则可以采用其他解决方案:
df_Z0 = (df.bin_Z_0).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size'])
.set_index('size', append=True)
df_Z1 = (df.bin_Z_1).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size'])
.set_index('size', append=True)
df_Z2 = (df.bin_Z_2).groupby([df.bins_Z, df.S, df.J]).agg(['sum','size'])
.set_index('size', append=True)
print (df_Z0.add(df_Z1).add(df_Z2).reset_index(level=3).unstack().sort_index(level='S'))
size sum
J 2012 2013 2014 2012 2013 2014
bins_Z S
0 AR1 114 102 115 -130.627936 -109.113076 -129.183069
1 AR1 111 102 118 -53.332480 -21.304159 -67.457064
2 AR1 106 111 113 -50.862985 34.782290 74.742122
3 AR1 96 107 128 0.000000 0.000000 0.000000
4 AR1 109 103 118 0.000000 0.000000 0.000000
5 AR1 109 109 113 0.000000 0.000000 0.000000
6 AR1 113 88 129 0.000000 0.000000 0.000000
7 AR1 108 106 117 0.000000 0.000000 0.000000
8 AR1 121 107 102 0.000000 0.000000 0.000000
9 AR1 112 105 114 0.000000 0.000000 0.000000
0 PO1 109 113 112 -102.842426 -104.375470 -143.081966
1 PO1 119 107 107 -24.968751 -28.222851 -57.769469
2 PO1 108 116 109 9.301271 -64.096715 -46.586959
3 PO1 120 88 125 0.000000 0.000000 0.000000
4 PO1 108 110 115 0.000000 0.000000 0.000000
5 PO1 100 110 123 0.000000 0.000000 0.000000
6 PO1 132 109 92 0.000000 0.000000 0.000000
7 PO1 120 108 105 0.000000 0.000000 0.000000
8 PO1 109 114 110 0.000000 0.000000 0.000000
9 PO1 107 111 116 0.000000 0.000000 0.000000
0 RU1 116 117 104 -110.563320 -124.948033 -121.455465
1 RU1 112 120 104 -17.513545 -48.205310 -32.040780
2 RU1 106 115 115 34.402681 3.013596 -23.436698
3 RU1 134 94 108 0.000000 0.000000 0.000000
4 RU1 99 115 122 0.000000 0.000000 0.000000
5 RU1 134 113 89 0.000000 0.000000 0.000000
6 RU1 91 128 117 0.000000 0.000000 0.000000
7 RU1 105 115 116 0.000000 0.000000 0.000000
8 RU1 113 121 102 0.000000 0.000000 0.000000
9 RU1 101 125 111 0.000000 0.000000 0.000000