import pandas as pd
import numpy as np
from collections import OrderedDict
import gc
import datetime
# Benchmark fixture: 10,000,000 rows x 5 columns ('A'..'E') of floats drawn by
# np.random.rand (uniform on [0, 1)). No seed is set, so values differ per run.
df = pd.DataFrame(np.random.rand(10000000, 5), columns=['A', 'B', 'C', 'D', 'E'])
# Truncate 'A' and 'B' down to multiples of 0.25 (0.0, 0.25, 0.5 or 0.75), so
# each key column has at most four distinct values to group on.
df['A'] = 0.25*((df['A']/0.25).astype(int))
df['B'] = 0.25*((df['B']/0.25).astype(int))
# Store the two key columns as strings.
df['A']=df['A'].astype(str)
df['B']=df['B'].astype(str)
# Duplicate the string keys under the names 'ix1' / 'ix2'.
df['ix1'] = df['A']
df['ix2'] = df['B']
# Categorical-dtype copies of the same keys under 'A1' / 'B1'.
df['A1']=df['A'].astype('category')
df['B1']=df['B'].astype('category')
# Force a garbage-collection pass once the frame is fully built.
gc.collect()
问题1:groupby 配合 apply 函数的耗时比 DataFrame.count 长得多,而两者做的是同样的事情。这里应该如何优化?
# Timing comparison for question 1. Both statements group `df` by the string
# key columns 'A' and 'B'; they need `df` and `genSummary` to be defined
# before they are run.

# This takes ~17 secs
df.groupby(['A', 'B']).apply(genSummary)
# This takes only ~3 secs
df.groupby(['A', 'B']).count()
问题2: 我需要将自定义函数应用于groupby对象。设置multiindex后,group by string比没有index ...
# NOTE: a stray "慢" ("slower") was fused onto the original def line; it is the
# final word of the question sentence on the line above, not part of the code.
def genSummary(group):
    """Summarise one group as a count, a sum and two weighted averages.

    Args:
        group: DataFrame-like object exposing the columns 'C', 'D' and 'E'.

    Returns:
        pd.Series with these entries, in this order:
            'Counts'     -- number of non-zero values in 'C'.
            'Sum'        -- sum of 'D'.
            'Wavg'       -- average of 'E' weighted by 'C'.
            'WavgMasked' -- average of 'E' with values zeroed where
                            'C' <= 0.5, weighted by 'C' where 'C' > 0
                            (weight 0 elsewhere).
    """
    c = group['C']
    d = group['D']
    e = group['E']
    # The second average originally reused the key 'Wavg'. Duplicate keys in an
    # OrderedDict keep only the last value, so the plain weighted average was
    # computed and then silently discarded. It now has its own key.
    return pd.Series(OrderedDict([
        ('Counts', np.count_nonzero(c)),
        ('Sum', np.sum(d)),
        ('Wavg', np.ma.average(e, weights=c)),
        ('WavgMasked', np.ma.average(e * (c > 0.5), weights=c * (c > 0))),
    ]))
# Timing comparison for question 2: the same genSummary aggregation applied
# with different grouping keys and index layouts. The numbered notes are the
# timings reported for the statement that follows each of them.

# 1 group by string takes ~8.1 secs
df.groupby(['A', 'B']).apply(genSummary)
# 2 group by category takes ~6.3 secs
df.groupby(['A1', 'B1']).apply(genSummary)
# Sort by the key copies and move them into a two-level index.
df.sort_values(['ix1', 'ix2'], inplace=True)
gc.collect()
df.set_index(['ix1', 'ix2'], inplace=True)
gc.collect()
# 3 set multiindex, group by string takes ~7.2 secs
df.groupby(['A', 'B']).apply(genSummary)
# 4 set multiindex, group by index takes ~5 secs
#   (the decimal digit of this timing is missing in the original note)
df.groupby(level=[0,1]).apply(genSummary)
# 5 set multiindex, group by category takes ~4.6 secs
df.groupby(['A1', 'B1']).apply(genSummary)
# Turn the index levels back into ordinary columns.
df = df.reset_index()
gc.collect()
# 6 reset_index, group by category takes ~4.9 secs
df.groupby(['A1', 'B1']).apply(genSummary)
# 7 reset_index, group by string takes ~6.2 secs
df.groupby(['A', 'B']).apply(genSummary)