给定一个 pandas 时间序列(如果更方便,也可以是 numpy 数组或普通的 Python 列表),我希望对序列中的每个点,求出序列下一次回到该点水平所需的等待时间。例如,如果第 T 天的值为 0 且第 T+1 天为正数,我想求出序列再次小于或等于 0 需要等待的天数;如果第 T+1 天为负数,则求出序列再次大于或等于 0 需要等待的天数。
import random
import pandas as pd
import numpy as np
# Fix the seed so the sample series is reproducible.
np.random.seed(1234567)
N = 10
# pd.util.testing.makeTimeSeries was removed in pandas 2.0, so build the
# series explicitly: N standard-normal draws on a business-day index starting
# 2000-01-03, accumulated into a random walk.
# NOTE(review): intended to match what makeTimeSeries(N) produced on old
# pandas versions -- confirm against the printed sample if exact values matter.
ts = pd.Series(
    np.random.randn(N),
    index=pd.bdate_range("2000-01-03", periods=N),
).cumsum()
我可以用双循环
def min2(x):
    """Return the smallest element of ``x``, or NaN when ``x`` is empty.

    ``x`` only needs to support ``len()`` and iteration. The NaN fallback lets
    callers do arithmetic on the result without special-casing "no match".
    """
    return min(x) if len(x) > 0 else np.nan
# Brute-force O(N^2) reference. For each day, look at the next day's move and
# then scan forward (starting two days ahead) for the first day on which the
# series is back at or beyond today's level on the opposite side of that move.
# Days with no such return stay NaN; a flat move counts as a wait of 1.
vals = ts.values  # positional access; avoids label/position ambiguity on ts[i]
n_obs = len(vals)
out = ts * np.nan
for idx, d in enumerate(ts.index):
    if idx + 1 < n_obs:
        if vals[idx + 1] > vals[idx]:
            out[d] = min2([k for k in range(idx + 2, n_obs)
                           if vals[k] <= vals[idx]]) - idx
        elif vals[idx + 1] < vals[idx]:
            out[d] = min2([k for k in range(idx + 2, n_obs)
                           if vals[k] >= vals[idx]]) - idx
        else:
            out[d] = 1
# Function-call form works on both Python 2 and Python 3.
print(ts)
2000-01-03 -0.514625
2000-01-04 -0.964179
2000-01-05 0.770442
2000-01-06 1.413822
2000-01-07 1.439962
2000-01-10 1.520343
2000-01-11 0.722954
2000-01-12 0.094867
2000-01-13 -0.251360
2000-01-14 0.716725
Freq: B, dtype: float64
# Function-call form works on both Python 2 and Python 3.
print(out)
2000-01-03 2.0
2000-01-04 NaN
2000-01-05 4.0
2000-01-06 3.0
2000-01-07 2.0
2000-01-10 NaN
2000-01-11 NaN
2000-01-12 2.0
2000-01-13 NaN
2000-01-14 NaN
Freq: B, dtype: float64
但是,当 N 很大时,有没有更高效的方法?
答案 0 :(得分:1)
这是一种使用两个栈的方法。遗憾的是,它很难向量化。尽管如此,在我对 10,000 个样本的测试中,它仍比原始循环快几百倍:
export class AppComponent {}
示例输出:
import numpy as np
# Benchmark signal: a sum of 20 sinusoids with random frequencies, sampled at
# N integer time steps.
freqs = np.random.randn(20)/10
N = 10**4
data = np.sin(np.arange(N)[:, None] * freqs).sum(axis=-1)

# Set to True to run on the 10-point sample series instead of the benchmark
# signal.
test = False
if test:
    data = """
2000-01-03 -0.514625
2000-01-04 -0.964179
2000-01-05 0.770442
2000-01-06 1.413822
2000-01-07 1.439962
2000-01-10 1.520343
2000-01-11 0.722954
2000-01-12 0.094867
2000-01-13 -0.251360
2000-01-14 0.716725
"""
    # Keep only the value column of each "date value" line.
    data = np.array([float(d.strip().split()[1])
                     for d in data.strip().split('\n')])
    # Keep the global length in sync with the replaced data so code that
    # reads N does not index past the end of the 10-point sample.
    N = len(data)
def min2(x):
    """Return the smallest element of ``x``, or NaN when ``x`` is empty.

    ``x`` only needs to support ``len()`` and iteration. The NaN fallback lets
    callers do arithmetic on the result without special-casing "no match".
    """
    return min(x) if len(x) > 0 else np.nan
def OP(data):
    """Brute-force O(n^2) reference implementation of the waiting time.

    For each position ``i`` the direction of the step to ``i + 1`` is taken,
    and the series is scanned from ``i + 2`` onwards for the first position
    whose value is back at or beyond ``data[i]`` on the opposite side of that
    step.

    Args:
        data: 1-D sequence of numbers.

    Returns:
        A float array of the same length. Entry ``i`` is the offset to the
        first such position, ``1`` when ``data[i + 1] == data[i]``, and NaN
        when the series never returns (always NaN for the last element).
    """
    data = np.asarray(data, dtype=float)
    # Use the argument's own length rather than a module-level N so the
    # function works for any input size.
    n = len(data)
    out = np.full(n, np.nan)
    for idx in range(n - 1):
        here = data[idx]
        nxt = data[idx + 1]
        if nxt > here:
            hits = [k for k in range(idx + 2, n) if data[k] <= here]
        elif nxt < here:
            hits = [k for k in range(idx + 2, n) if data[k] >= here]
        else:
            out[idx] = 1
            continue
        # The full candidate list is still built on purpose: this function
        # is the slow baseline in the timings. No candidates -> stays NaN.
        if hits:
            out[idx] = min(hits) - idx
    return out
def PP(data):
    """Compute the waiting times in a single pass using two stacks.

    Both stacks live in one preallocated index array: the "low" stack grows
    from the front (slots ``0 .. lp-1``) and holds positions that were
    followed by an up-step and are waiting for the series to fall back to
    their level; the "high" stack grows from the back (slots ``-1`` down to
    ``hp+1``) and holds positions that were followed by a down-step and are
    waiting for the series to rise back to their level. Each position is
    pushed at most once, so the two stacks never overlap.

    Args:
        data: 1-D sequence of numbers.

    Returns:
        A float array shaped like ``data``. Entry ``i`` is the offset to the
        position where the series is back at or beyond ``data[i]``, ``1``
        when ``data[i + 1] == data[i]``, and NaN when no return is found.
    """
    data = np.asarray(data)
    stack = np.empty(data.shape, int)
    wait = np.full(data.shape, np.nan)
    lp = 0    # one past the top of the low stack
    hp = -1   # next free slot of the high stack (negative index)
    # Consecutive (current, next) pairs; zip over shifted views also copes
    # with empty and single-element input.
    for j, (do, dn) in enumerate(zip(data[:-1], data[1:])):
        if dn > do:
            stack[lp] = j
            lp += 1
            # The up-step resolves every high-stack entry whose level has
            # now been reached again; the new value sits at index j + 1.
            while hp < -1 and dn >= data[stack[hp+1]]:
                hp += 1
                wait[stack[hp]] = j - stack[hp] + 1
        elif dn < do:
            stack[hp] = j
            hp -= 1
            # Symmetric case: the down-step resolves low-stack entries.
            while lp > 0 and dn <= data[stack[lp-1]]:
                lp -= 1
                wait[stack[lp]] = j - stack[lp] + 1
        else:
            wait[j] = 1
    return wait
def check(data, wait):
    """Sanity-check a waiting-time array against the series it came from.

    For every position with a non-NaN wait, asserts that
    (1) at offset ``wait`` the series is at, or on the opposite side of, the
        starting level relative to the first step, and
    (2) at offset ``wait - 1`` it is still at, or on the same side as, the
        first step.
    Intermediate offsets are not inspected. Prints ``test passed`` on
    success.

    Args:
        data: 1-D numpy array of values.
        wait: float numpy array of waiting times, NaN where undefined.

    Raises:
        AssertionError: if either condition fails for any position.
    """
    w = np.where(~np.isnan(wait))[0]
    offsets = wait[w].astype(int)
    first_step = data[w + 1] - data[w]
    assert np.all(first_step * (data[w + offsets] - data[w]) <= 0)
    assert np.all(first_step * (data[w + offsets - 1] - data[w]) >= 0)
    print('test passed')
# Run both implementations, sanity-check the fast one, and compare results.
waito = OP(data)
wait = PP(data)
check(data, wait)
# NaN != NaN, so positions where both results are NaN are counted as equal.
print('outputs equal',
      np.all((wait==waito) | (np.isnan(wait) & np.isnan(waito))))

from timeit import timeit

print('\nTimings:')
for f in OP, PP:
    # timeit returns total seconds for number=10 calls;
    # total * 1000 / 10 == total * 100 gives milliseconds per call.
    print('{:16s} {:10.6f} ms'.format(f.__name__, timeit(
        lambda: f(data), number=10) * 100))