The original dataset is similar to this one (although much larger):
idx = [np.array(['Jan', 'Jan', 'Feb', 'Mar', 'Mar', 'Mar','Apr', 'Apr', 'May', 'Jun', 'Jun', 'Jun','Jul', 'Aug', 'Aug', 'Sep', 'Sep', 'Oct','Oct', 'Oct', 'Nov', 'Dic', 'Dic',]),np.array(['A', 'B', 'B', 'A', 'B', 'C', 'A', 'B', 'B', 'A', 'B', 'C','A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'A', 'B', 'C'])]
data = [{'x': 1}, {'x': 5}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3},{'x': 1}, {'x': 6}, {'x': 3}, {'x': 5}, {'x': 2}, {'x': 3},{'x': 1}, {'x': 9}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3}, {'x': 6}, {'x': 8}, {'x': 2}, {'x': 7}, {'x': 9}]
df = pd.DataFrame(data, index=idx, columns=['x'])
df.index.names=['date','type']
It looks like this:
           x
date type
Jan  A     1
     B     5
Feb  B     3
Mar  A     2
     B     7
     C     3
Apr  A     1
     B     6
May  B     3
Jun  A     5
     B     2
     C     3
Jul  A     1
Aug  B     9
     C     3
Sep  A     2
     B     7
Oct  C     3
     A     6
     B     8
Nov  A     2
Dic  B     7
     C     9
My goal is to improve the following code, which creates a new column in the DataFrame (a rolling moving average that weights each lag differently). My code is:
df = df.reset_index()
df['rolling'] = 0
for j in df['type'].unique():
    list_1 = list(df['x'][df['type'] == j])
    cumsum = [0]
    list_2 = list(df['x'][df['type'] == j].index)
    z = []
    for i, h in enumerate(list_1, 1):
        if i >= 4:
            cumsum.append(0.2*list_1[i-4] + 0.3*list_1[i-3] + 0.5*list_1[i-2])
        else:
            cumsum.append('NaN')
        cumsum.pop(0)
        z.append(cumsum[0])
    df['rolling'][list_2] = z
The result looks like this:
   date type  x rolling
0   Jan    A  1     NaN
1   Jan    B  5     NaN
2   Feb    B  3     NaN
3   Mar    A  2     NaN
4   Mar    B  7     NaN
5   Mar    C  3     NaN
6   Apr    A  1     NaN
7   Apr    B  6     5.4
8   May    B  3     5.7
9   Jun    A  5     1.3
10  Jun    B  2     4.7
11  Jun    C  3     NaN
12  Jul    A  1     3.2
13  Aug    B  9     3.1
14  Aug    C  3     NaN
15  Sep    A  2     2.2
16  Sep    B  7     5.7
17  Oct    C  3       3
18  Oct    A  6     2.3
19  Oct    B  8     6.6
20  Nov    A  2     3.8
21  Dic    B  7     7.9
22  Dic    C  9       3
If your code performs better than mine, it would be interesting to know how much faster it is. If you think your code is better but don't know how fast it runs, please post it anyway, because I will be running it on a much larger DataFrame. Thanks!
Answer 0 (score: 0)
Let's try this and see whether it speeds up your code:
idx = [np.array(['Jan', 'Jan', 'Feb', 'Mar', 'Mar', 'Mar','Apr', 'Apr', 'May', 'Jun', 'Jun', 'Jun','Jul', 'Aug', 'Aug', 'Sep', 'Sep', 'Oct','Oct', 'Oct', 'Nov', 'Dic', 'Dic',]),np.array(['A', 'B', 'B', 'A', 'B', 'C', 'A', 'B', 'B', 'A', 'B', 'C','A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'A', 'B', 'C'])]
data = [{'x': 1}, {'x': 5}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3},{'x': 1}, {'x': 6}, {'x': 3}, {'x': 5}, {'x': 2}, {'x': 3},{'x': 1}, {'x': 9}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3}, {'x': 6}, {'x': 8}, {'x': 2}, {'x': 7}, {'x': 9}]
df = pd.DataFrame(data, index=idx, columns=['x'])
df.index.names=['date','type']
df['rolling'] = df.groupby('type')['x'].rolling(4).apply(lambda x: x[-4]*.2 + x[-3]*.3 + x[-2]*.5, raw=True)\
                  .reset_index(level=2, drop=True).swaplevel(0,1)
df
Output:
           x  rolling
date type
Jan  A     1      NaN
     B     5      NaN
Feb  B     3      NaN
Mar  A     2      NaN
     B     7      NaN
     C     3      NaN
Apr  A     1      NaN
     B     6      5.4
May  B     3      5.7
Jun  A     5      1.3
     B     2      4.7
     C     3      NaN
Jul  A     1      3.2
Aug  B     9      3.1
     C     3      NaN
Sep  A     2      2.2
     B     7      5.7
Oct  C     3      3.0
     A     6      2.3
     B     8      6.6
Nov  A     2      3.8
Dic  B     7      7.9
     C     9      3.0
Timings...
Your code:
324 ms ± 1.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
This code:
12.6 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
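If apply is still a bottleneck on the full data, the same weights can also be written with groupby().shift(), which avoids calling a Python function per window. This is only a sketch (not timed here), and the column name rolling2 is just a placeholder:
g = df.groupby(level='type')['x']
# each row combines the three previous rows of its own group:
# 0.5 * previous row + 0.3 * two rows back + 0.2 * three rows back
df['rolling2'] = 0.5*g.shift(1) + 0.3*g.shift(2) + 0.2*g.shift(3)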
Answer 1 (score: 0)
I am trying to run this solution, but I get an error.
My code:
import pandas as pd
import numpy as np
idx = [np.array(['Jan', 'Jan', 'Feb', 'Mar', 'Mar', 'Mar','Apr', 'Apr', 'May', 'Jun', 'Jun', 'Jun','Jul', 'Aug', 'Aug', 'Sep', 'Sep', 'Oct','Oct', 'Oct', 'Nov', 'Dic', 'Dic',]),np.array(['A', 'B', 'B', 'A', 'B', 'C', 'A', 'B', 'B', 'A', 'B', 'C','A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'A', 'B', 'C'])]
data = [{'x': 1}, {'x': 5}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3},{'x': 1}, {'x': 6}, {'x': 3}, {'x': 5}, {'x': 2}, {'x': 3},{'x': 1}, {'x': 9}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3}, {'x': 6}, {'x': 8}, {'x': 2}, {'x': 7}, {'x': 9}]
df = pd.DataFrame(data, index=idx, columns=['x'])
df.index.names=['date','type']
df['rolling'] = df.groupby('type')['x'].rolling(4).apply(
    lambda x: x[-4]*.2 + x[-3]*.3 + x[-2]*.5
).reset_index(level=2, drop=True).swaplevel(0,1)
print(df)
My output:
ValueError Traceback (most recent call last)
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in
apply(self, func, *args, **kwargs)
917 try:
--> 918 result = self._python_apply_general(f)
919 except Exception:
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in
_python_apply_general(self, f)
940 values,
--> 941 not_indexed_same=mutated or self.mutated)
942
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in
_wrap_applied_output(self, keys, values, not_indexed_same)
3611 return self._concat_objects(keys, values,
-> 3612
not_indexed_same=not_indexed_same)
3613 elif isinstance(values[0], DataFrame):
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in
_concat_objects(self, keys, values, not_indexed_same)
1135 levels=group_levels, names=group_names,
-> 1136 sort=False)
1137 else:
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in
concat(objs, axis, join, join_axes, ignore_index, keys, levels, names,
verify_integrity, sort, copy)
224 verify_integrity=verify_integrity,
--> 225 copy=copy, sort=sort)
226 return op.get_result()
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in
__init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index,
verify_integrity, copy, sort)
377
--> 378 self.new_axes = self._get_new_axes()
379
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _get_new_axes(self)
457
--> 458 new_axes[self.axis] = self._get_concat_axis()
459 return new_axes
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _get_concat_axis(self)
513 concat_axis = _make_concat_multiindex(indexes, self.keys,
--> 514 self.levels, self.names)
515
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _make_concat_multiindex(indexes, keys, levels, names)
594 return MultiIndex(levels=levels, labels=label_list, names=names,
--> 595 verify_integrity=False)
596
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
231 # handles name validation
--> 232 result._set_names(names)
233
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
694 'level {}, is already used for level '
--> 695 '{}.'.format(name, l, used[name]))
696
ValueError: Duplicated level name: "type", assigned to level 2, is already used for level 0.
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-25-e57e374b3102> in <module>()
7 df.index.names=['date','type']
8
----> 9 df['rolling'] = df.groupby('type')['x'].rolling(4).apply(lambda x: x[-4]*.2 + x[-3]*.3 + x[-2]*.5 ).reset_index(level=2, drop=True).swaplevel(0,1)
10
11 print(df)
~/anaconda3/lib/python3.6/site-packages/pandas/core/window.py in apply(self, func, raw, args, kwargs)
1578 def apply(self, func, raw=None, args=(), kwargs={}):
1579 return super(Rolling, self).apply(
-> 1580 func, raw=raw, args=args, kwargs=kwargs)
1581
1582 @Substitution(name='rolling')
~/anaconda3/lib/python3.6/site-packages/pandas/core/window.py in apply(self, func, raw, args, kwargs)
1001
1002 return self._apply(f, func, args=args, kwargs=kwargs,
-> 1003 center=False, raw=raw)
1004
1005 def sum(self, *args, **kwargs):
~/anaconda3/lib/python3.6/site-packages/pandas/core/window.py in _apply(self, func, name, window, center, check_minp, **kwargs)
802 return x.apply(name, *args, **kwargs)
803
--> 804 return self._groupby.apply(f)
805
806
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
3467 examples=_apply_docs['series_examples']))
3468 def apply(self, func, *args, **kwargs):
-> 3469 return super(SeriesGroupBy, self).apply(func, *args, **kwargs)
3470
3471 @Appender(_agg_doc)
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in apply(self, func, *args, **kwargs)
928
929 with _group_selection_context(self):
--> 930 return self._python_apply_general(f)
931
932 return result
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _python_apply_general(self, f)
939 keys,
940 values,
--> 941 not_indexed_same=mutated or self.mutated)
942
943 def _iterate_slices(self):
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _wrap_applied_output(self, keys, values, not_indexed_same)
3610 if isinstance(values[0], (Series, dict)):
3611 return self._concat_objects(keys, values,
-> 3612 not_indexed_same=not_indexed_same)
3613 elif isinstance(values[0], DataFrame):
3614 # possible that Series -> DataFrame by applied function
~/anaconda3/lib/python3.6/site-packages/pandas/core/groupby/groupby.py in _concat_objects(self, keys, values, not_indexed_same)
1134 result = concat(values, axis=self.axis, keys=group_keys,
1135 levels=group_levels, names=group_names,
-> 1136 sort=False)
1137 else:
1138
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, sort, copy)
223 keys=keys, levels=levels, names=names,
224 verify_integrity=verify_integrity,
--> 225 copy=copy, sort=sort)
226 return op.get_result()
227
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy, sort)
376 self.copy = copy
377
--> 378 self.new_axes = self._get_new_axes()
379
380 def get_result(self):
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _get_new_axes(self)
456 new_axes[i] = ax
457
--> 458 new_axes[self.axis] = self._get_concat_axis()
459 return new_axes
460
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _get_concat_axis(self)
512 else:
513 concat_axis = _make_concat_multiindex(indexes, self.keys,
--> 514 self.levels, self.names)
515
516 self._maybe_check_integrity(concat_axis)
~/anaconda3/lib/python3.6/site-packages/pandas/core/reshape/concat.py in _make_concat_multiindex(indexes, keys, levels, names)
593
594 return MultiIndex(levels=levels, labels=label_list, names=names,
--> 595 verify_integrity=False)
596
597 new_index = indexes[0]
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in __new__(cls, levels, labels, sortorder, names, dtype, copy, name, verify_integrity, _set_identity)
230 if names is not None:
231 # handles name validation
--> 232 result._set_names(names)
233
234 if sortorder is not None:
~/anaconda3/lib/python3.6/site-packages/pandas/core/indexes/multi.py in _set_names(self, names, level, validate)
693 'Duplicated level name: "{}", assigned to '
694 'level {}, is already used for level '
--> 695 '{}.'.format(name, l, used[name]))
696
697 self.levels[l].rename(name, inplace=True)
ValueError: Duplicated level name: "type", assigned to level 2, is already used for level 0.
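A possible workaround, offered only as an untested sketch for that pandas version: the error says the group key 'type' clashes with the index level that is already named 'type', so grouping on a plain column (after reset_index) sidesteps the duplicated level name. Upgrading pandas may also make the original snippet from Answer 0 work as written.
tmp = df.reset_index()                      # 'type' becomes an ordinary column
rolled = tmp.groupby('type')['x'].rolling(4).apply(
    lambda x: x[-4]*.2 + x[-3]*.3 + x[-2]*.5)
# rolled is indexed by (type, original row number); drop the group key,
# restore the original row order, then assign by position
df['rolling'] = rolled.reset_index(level=0, drop=True).sort_index().values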
Answer 2 (score: 0)
Here is a numpy version. It gives a healthy speedup (68x on the small example). Since it uses linear correlation, that factor will grow even larger if your real window is bigger than 3, because correlate will then switch to a more efficient FFT-based method.
import numpy as np
import pandas as pd
from scipy import signal
idx = [np.array(['Jan', 'Jan', 'Feb', 'Mar', 'Mar', 'Mar','Apr', 'Apr', 'May', 'Jun', 'Jun', 'Jun','Jul', 'Aug', 'Aug', 'Sep', 'Sep', 'Oct','Oct', 'Oct', 'Nov', 'Dic', 'Dic',]),np.array(['A', 'B', 'B', 'A', 'B', 'C', 'A', 'B', 'B', 'A', 'B', 'C','A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'A', 'B', 'C'])]
data = [{'x': 1}, {'x': 5}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3},{'x': 1}, {'x': 6}, {'x': 3}, {'x': 5}, {'x': 2}, {'x': 3},{'x': 1}, {'x': 9}, {'x': 3}, {'x': 2}, {'x': 7}, {'x': 3}, {'x': 6}, {'x': 8}, {'x': 2}, {'x': 7}, {'x': 9}]
df = pd.DataFrame(data, index=idx, columns=['x'])
df.index.names=['date','type']
df = df.reset_index()
weights = np.array((0.2,0.3,0.5))
def running_avg():
    if 'running' in df.columns:
        del df['running']
    n = len(weights)
    tp, x = df['type'].values, df['x'].values
    sidx = np.argsort(tp, kind='stable')
    stp = tp[sidx]
    bnds = np.where(stp[1:] != stp[:-1])[0] + 1
    running = np.empty(sidx.shape)
    for bit in np.split(sidx, bnds):
        running[bit[:n]] = np.nan
        if len(bit) > n:
            running[bit[n:]] = signal.correlate(x[bit[:-1]], weights, 'valid', 'auto')
    df['running'] = running
def running_OP():
    df['rolling'] = 0
    for j in df['type'].unique():
        list_1 = list(df['x'][df['type'] == j])
        cumsum = [0]
        list_2 = list(df['x'][df['type'] == j].index)
        z = []
        for i, h in enumerate(list_1, 1):
            if i >= 4:
                cumsum.append(0.2*list_1[i-4] + 0.3*list_1[i-3] + 0.5*list_1[i-2])
            else:
                cumsum.append('NaN')
            cumsum.pop(0)
            z.append(cumsum[0])
        df['rolling'][list_2] = z
from timeit import repeat
T0 = np.array(repeat(running_OP, repeat=7, number=10))*100
print(f'\nOP: {T0.mean():.3f} ± {T0.std():.3f} ms')
T1 = np.array(repeat(running_avg, repeat=7, number=100))*10000
print(f'pp {T1.mean():.3f} ± {T1.std():.3f} \N{GREEK SMALL LETTER MU}s')
print("\nresults are " + ["different", "equal"][((np.isnan(df['running']) & np.isnan(df['rolling'].astype(float))) | (df['running'] == df['rolling'])).all()])
print(f'speedup roughly {T0.mean()/T1.mean()*1000:.0f}\N{MULTIPLICATION X}')
Sample run:
OP: 62.500 ± 0.473 ms
pp 903.769 ± 11.491 μs
results are equal
speedup roughly 69✕
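For intuition, here is a tiny standalone illustration (added for clarity, not part of the benchmark above) of what signal.correlate computes in 'valid' mode with these weights:
import numpy as np
from scipy import signal

x = np.array([1., 2., 5., 2., 3.])
weights = np.array([0.2, 0.3, 0.5])
# 'valid' slides the (un-flipped) weights over x, so output element k equals
# 0.2*x[k] + 0.3*x[k+1] + 0.5*x[k+2]
print(signal.correlate(x, weights, 'valid'))   # approximately [3.3, 2.9, 3.1]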