我的应用程序需要计算Pandas多级数据帧的滚动总和,我想找到一种缩短处理时间的方法。
mul_df() 是一个用于创建演示用多级索引数据帧的函数。
import itertools
import numpy as np
import pandas as pd
def mul_df(level1_rownum, level2_rownum, col_num):
    """Create a demo DataFrame with a two-level row index.

    The frame has ``level1_rownum * level2_rownum`` rows and ``col_num``
    columns filled with values from ``np.random.randn``.

    Parameters
    ----------
    level1_rownum : int
        Number of distinct labels in the outer index level ``IDX_1``
        (``A0000``, ``A0001``, ...).
    level2_rownum : int
        Number of distinct labels in the inner index level ``IDX_2``
        (``B000``, ``B001``, ...); every outer label gets all of them.
    col_num : int
        Number of data columns (``COL000``, ``COL001``, ...).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``['IDX_1', 'IDX_2']``.
    """
    index_name = ['IDX_1', 'IDX_2']
    col_name = ['COL' + str(x).zfill(3) for x in range(col_num)]

    # Outer labels: each one repeated level2_rownum times in a row
    # (A0000, A0000, ..., A0001, A0001, ...).
    first_level_dt = [
        'A' + str(x).zfill(4)
        for x in range(level1_rownum)
        for _ in range(level2_rownum)
    ]
    # Inner labels: the full B000..B{n} run, tiled once per outer label.
    second_level_dt = (
        ['B' + str(x).zfill(3) for x in range(level2_rownum)] * level1_rownum
    )

    dt = pd.DataFrame(
        np.random.randn(level1_rownum * level2_rownum, col_num),
        columns=col_name,
    )
    dt[index_name[0]] = first_level_dt
    dt[index_name[1]] = second_level_dt
    return dt.set_index(index_name, drop=True, inplace=False)
例如:
>>> df = mul_df(4,5,3)
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 0.2317 -0.6122 0.2289
B001 -0.9218 -0.2918 1.7295
B002 0.1368 0.6659 -1.9193
B003 0.3839 -0.8542 -0.3065
B004 2.0361 -0.4601 1.1246
A0001 B000 0.3039 -0.6761 1.3762
B001 1.1767 0.8465 -0.1745
B002 0.4937 1.6774 -0.3038
B003 -0.3627 -1.6413 -0.7373
B004 -0.0149 1.5900 0.3385
A0002 B000 0.0326 0.2637 1.7990
B001 -0.1071 0.6097 -0.2812
B002 -0.2199 0.7360 1.9425
B003 -1.0423 0.6763 -0.2479
B004 -0.9024 0.3016 -2.7585
A0003 B000 0.2550 0.0470 0.6849
B001 0.5986 0.3283 1.6327
B002 0.8929 -1.1128 -0.9495
B003 -0.5633 1.7935 0.1652
B004 1.0417 -0.4833 0.3413
并使用以下命令计算“IDX_1”每个列数据组的滚动总和(窗口大小为4):
>>> df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
COL000 COL001 COL002
IDX_1 IDX_2
A0000 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -0.1694 -1.0923 -0.2675
B004 1.6350 -0.9402 0.6282
A0001 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.6116 0.2064 0.1606
B004 1.2928 2.4726 -0.8771
A0002 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 -1.3367 2.2857 3.2125
B004 -2.2717 2.3236 -1.3451
A0003 B000 NaN NaN NaN
B001 NaN NaN NaN
B002 NaN NaN NaN
B003 1.1832 1.0559 1.5334
B004 1.9699 0.5256 1.1898
>>>
我尝试对一个较大的数据帧计算 rolling_sum()。
In [1]: df = mul_df(1000,25,1000)
In [2]: timeit df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
1 loops, best of 3: 52.1 s per loop
对于形状为 (1000*25, 1000) 的数据帧,这一计算耗时 52.1 秒。如何加速 rolling_sum?(我的意思是:有没有其他方法可以得到相同的计算结果,但花费的时间更少?)
编辑(为waitingkuo的解决方案添加内存错误消息)
In [1]: df = mul_df(1000,25,1000)
In [2]: k2 = df.frs(4)
---------------------------------------------------------------------------
MemoryError Traceback (most recent call last)
<ipython-input-2-1b54b2662162> in <module>()
----> 1 k2 = df.frs(4)
F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in wrapped(*args, **kwargs)
149 from datetime import datetime
150 t1 = datetime.now()
--> 151 rst = fn(*args, **kwargs)
152 t2 = datetime.now()
153 print "Time: %0.3f"%((t2-t1).seconds + (t2-t1).microseconds/1000000.0)
F:\STK Analysis\Kits\Dev_Tools\FinReporter\FM_CORE.pyc in _frs(df, n)
864 ''' fast_rolling_sum , http://stackoverflow.com/questions/15652343/how-to-speed-up-pandas-rolling-sum '''
865 grp = df.groupby(level='STK_ID')
--> 866 return np.sum([grp.shift(i) for i in range(n)])
867 DataFrame.frs = _frs
868
D:\Python\lib\site-packages\pandas\core\groupby.pyc in wrapper(*args, **kwargs)
259 return self.apply(curried_with_axis)
260 except Exception:
--> 261 return self.apply(curried)
262
263 return wrapper
D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, func, *args, **kwargs)
320 func = _intercept_function(func)
321 f = lambda g: func(g, *args, **kwargs)
--> 322 return self._python_apply_general(f)
323
324 def _python_apply_general(self, f):
D:\Python\lib\site-packages\pandas\core\groupby.pyc in _python_apply_general(self, f)
323
324 def _python_apply_general(self, f):
--> 325 keys, values, mutated = self.grouper.apply(f, self.obj, self.axis)
326
327 return self._wrap_applied_output(keys, values,
D:\Python\lib\site-packages\pandas\core\groupby.pyc in apply(self, f, data, axis, keep_internal)
583 if hasattr(splitter, 'fast_apply') and axis == 0:
584 try:
--> 585 values, mutated = splitter.fast_apply(f, group_keys)
586 return group_keys, values, mutated
587 except lib.InvalidApply:
D:\Python\lib\site-packages\pandas\core\groupby.pyc in fast_apply(self, f, names)
2136 return [], True
2137
-> 2138 sdata = self._get_sorted_data()
2139 results, mutated = lib.apply_frame_axis0(sdata, f, names, starts, ends)
2140
D:\Python\lib\site-packages\pandas\core\groupby.pyc in _get_sorted_data(self)
2103
2104 def _get_sorted_data(self):
-> 2105 return self.data.take(self.sort_idx, axis=self.axis)
2106
2107 def _chop(self, sdata, slice_obj):
D:\Python\lib\site-packages\pandas\core\frame.pyc in take(self, indices, axis)
2900 new_values = com.take_2d(self.values,
2901 com._ensure_int64(indices),
-> 2902 axis=axis)
2903 if axis == 0:
2904 new_columns = self.columns
D:\Python\lib\site-packages\pandas\core\common.pyc in take_2d(arr, indexer, out, mask, needs_masking, axis, fill_value)
426 elif dtype_str in ('float64', 'object', 'datetime64[ns]'):
427 if out is None:
--> 428 out = np.empty(out_shape, dtype=arr.dtype)
429 take_f = _get_take2d_function(dtype_str, axis=axis)
430 take_f(arr, _ensure_int64(indexer), out=out, fill_value=fill_value)
MemoryError:
In [3]:
答案 0 :(得分:4)
先对每个分组的数据做移位(shift),再把各次移位的结果相加,怎么样?
def my_rolling_sum(d, n):
    """Return the per-group rolling sum of ``d`` over a window of ``n`` rows.

    Rows are grouped by the ``IDX_1`` index level. Within each group the
    frame is shifted by 0, 1, ..., n-1 rows and the shifted frames are added
    elementwise, so the first ``n - 1`` rows of every group are NaN.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose index has a level named ``IDX_1``.
    n : int
        Window size; expected to be at least 1.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``d``.
    """
    g = d.groupby(level='IDX_1')
    # Use the built-in sum (repeated DataFrame "+") rather than np.sum:
    # NumPy would first coerce the list of frames into a single 3-D array
    # and, with no axis given, reduce it to one scalar.
    return sum([g.shift(i) for i in range(n)])
让我们看看表现:
In [224]: df = mul_df(1000,25,1000)
In [225]: timeit df.groupby(level='IDX_1').apply(lambda x: pd.rolling_sum(x,4))
1 loops, best of 3: 32.4 s per loop
In [230]: timeit my_rolling_sum(df, 4)
1 loops, best of 3: 7.15 s per loop
不过这种做法内存消耗太大,于是我尝试对它稍作修改:
def my_rolling_sum(d, n):
    """Return the per-group rolling sum of ``d`` over a window of ``n`` rows.

    Rows are grouped by the ``IDX_1`` index level. The result starts as the
    unshifted data and the frames shifted by 1, ..., n-1 rows within each
    group are added to it one at a time, instead of first collecting all
    shifted frames in a list. The first ``n - 1`` rows of every group are
    NaN.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame whose index has a level named ``IDX_1``.
    n : int
        Window size; expected to be at least 1.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index and columns as ``d``.
    """
    g = d.groupby(level='IDX_1')
    result = g.shift(0)
    for i in range(1, n):
        result = result + g.shift(i)
    # The accumulated frame must be handed back; without this the function
    # would return None.
    return result
希望它对你有所帮助。