I have a function defined in Python (below).
The problem is that it is very slow. The array 'a' can be 100k+ elements, and the function takes 7-10 minutes to complete.
I found that I need to keep iterating over the array after the initial pass, because after averaging, the new average value can itself fall within close range of another value and need to be averaged again.
I have thought about splitting the array into chunks and processing them with multiprocessing; my concern is that the end of one chunk and the start of the next would also need to be averaged together.
def reshape_arr(a, close):
    """Iterates through 'a' to find values +- 'close', and averages them, then returns a distinct array of values"""
    flag = True
    while flag:
        array = a.sort_values().unique()
        l = len(array)
        flag = False
        for i in range(l):
            previous_item = next_item = None
            if i > 0:
                previous_item = array[i - 1]
            if i < (l - 1):
                next_item = array[i + 1]
            if previous_item is not None:
                if abs(array[i] - previous_item) < close:
                    average = (array[i] + previous_item) / 2
                    flag = True
                    # find matching values in a, and replace with the average
                    a.replace(previous_item, value=average, inplace=True)
                    a.replace(array[i], value=average, inplace=True)
            if next_item is not None:
                if abs(next_item - array[i]) < close:
                    flag = True
                    average = (array[i] + next_item) / 2
                    # find matching values in a, and replace with the average
                    a.replace(array[i], value=average, inplace=True)
                    a.replace(next_item, value=average, inplace=True)
    return a.unique()
a is a pandas.Series from a DataFrame with anywhere between 0 and 200k rows, and close is an int (e.g. 100).
It works, it is just very slow.
Answer 0 (score: 2)
You can use the following function to produce output similar to yours (the difference being that the result of your function is unsorted, since a is never sorted outside the loop and pd.Series.unique returns values in order of appearance; if you actually need the original order, check the second function below). There is no need to sort the array on every loop iteration, because replacing two subsequent (unique) items of a sorted array with their average does not invalidate the sorting. Since the comparison with next_item in one iteration is the comparison with prev_item in the next iteration, you only need to compare subsequent elements pairwise once.
import numpy as np

def solve_sorted(a, close):
    """Returns the reduced unique values as a sorted array."""
    a = a.sort_values().values.astype(float)
    while True:
        a = np.unique(a)
        comp = a[1:] - a[:-1] < close
        if not comp.sum():
            break
        indices = np.tile(comp.nonzero()[0][:, None], (1, 2))
        indices[:, 1] += 1
        avg = a[indices].mean(axis=1)
        a[indices.ravel()] = np.repeat(avg, 2)
    return np.unique(a)
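
For illustration, a minimal usage sketch (the Series values here are made up): values that lie within close of each other collapse to their pairwise average.

import numpy as np
import pandas as pd

s = pd.Series([11.0, 10.0, 26.0, 50.0, 25.0])
# 10/11 and 25/26 are within close=2 of each other, so each pair
# collapses to its average; 50 is left untouched
print(solve_sorted(s, close=2))  # -> [10.5, 25.5, 50.0]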
If preserving the original order of the elements is important, you can store the reverse-sort indices once at the beginning and restore the original order at the end:
def solve_preserve_order(a, close):
    """Returns the reduced unique values in their original order."""
    reverse_indices = np.argsort(np.argsort(a.values))
    a = a.sort_values()
    while True:
        b = a.unique()
        comp = b[1:] - b[:-1] < close
        if not comp.sum():
            break
        indices = np.tile(comp.nonzero()[0][:, None], (1, 2))
        indices[:, 1] += 1
        avg = b[indices].mean(axis=1)
        a.replace(b[indices.ravel()], np.repeat(avg, 2), inplace=True)
    return a.iloc[reverse_indices].unique()
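
The double argsort at the start is the standard rank trick: np.argsort(np.argsort(x)) yields, for each original element, its position in the sorted order. A small sketch of why indexing by these ranks restores the original order:

import numpy as np

x = np.array([3.0, 1.0, 2.0])
order = np.argsort(x)        # indices that sort x: [1, 2, 0]
ranks = np.argsort(order)    # rank of each original element: [2, 0, 1]
# indexing the sorted array by the ranks restores the original order
assert np.array_equal(np.sort(x)[ranks], x)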
Answer 1 (score: 2)
First, note that if the length of the input array a is large and close is relatively small, your proposed algorithm may be numerically unstable.
That said, here are some ideas that reduce the time complexity from O(N^3) to O(N) (for an approximate implementation) or O(N^2) (for an equivalent implementation). For N=100, this gives a speedup of up to 6000x for some choices of arr and close.
Consider an input array arr = [a, b, c, d], and suppose that close > d - a. In this case, the algorithm proceeds as follows:

[a, b, c, d]
[(a+b)/2, (b+c)/2, (c+d)/2]
[(a+2b+c)/4, (b+2c+d)/4]
[(a+3b+3c+d)/8]
One can recognize that if [x_1, x_2, ..., x_n] is a maximal contiguous subarray of arr such that x_i - x_{i-1} < close, then [x_1, x_2, ..., x_n] eventually evaluates to (sum_{k=1}^{n} x_k * c_{n-1,k-1}) / 2^(n-1), where c_{n,k} is the binomial coefficient "n choose k". In the example above (n = 4), the weights 1, 3, 3, 1 are c_{3,0}, ..., c_{3,3} and the divisor is 2^3 = 8.
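
One can check the closed form numerically; a small sketch (the array and the assumption close > 0.3 are arbitrary choices for the demo):

import numpy as np
from scipy.special import comb

arr = np.array([0.0, 0.1, 0.2, 0.3])  # assume close > 0.3, so the whole array merges
n = arr.size
closed_form = np.average(arr, weights=comb(n - 1, np.arange(n)))  # 0.15
x = arr.copy()
while x.size > 1:           # repeated pairwise averaging, as in the steps above
    x = (x[1:] + x[:-1]) / 2
assert np.isclose(x[0], closed_form)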
This gives the following O(N) implementation:
import numpy as np
from scipy.stats import binom
from scipy.special import comb

def binom_mean(arr, scipy_cutoff=64):
    """
    Given an array arr, returns an average of arr
    weighted by binomial coefficients.
    """
    n = arr.shape[0]
    if arr.shape[0] == 1:
        return arr[0]
    # initializing a scipy binomial random variable can be slow
    # so, if short runs are likely, we can speed things up
    # by doing explicit computations
    elif n < scipy_cutoff:
        return np.average(arr, weights=comb(n - 1, np.arange(n), exact=False))
    else:
        f = binom(n - 1, 0.5).pmf
        return np.average(arr, weights=f(np.arange(n)))

def reshape_arr_binom(arr, close):
    d = np.ediff1d(arr, to_begin=0) < close
    close_chunks = np.split(arr, np.where(~d)[0])
    return np.fromiter(
        (binom_mean(c) for c in close_chunks),
        dtype=float,  # np.float was removed in NumPy 1.24; use the builtin
    )
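
A quick sanity check on made-up input (note the function assumes a sorted array):

import numpy as np

arr = np.array([0.0, 0.01, 0.02, 0.5, 0.51, 0.9])
print(reshape_arr_binom(arr, close=0.05))
# -> [0.01, 0.505, 0.9]: each run of close values is reduced to
# its binomial-weighted average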
The results are within 10e-15 of your implementation for np.random.seed(0); N = 1000; cost = 1/N; arr = np.random.rand(N). However, for large N, this may not be meaningful unless cost is small. For the above parameter values, this is 270 times faster than the original code on my machine.
However, if we choose a moderate value of N = 100 and set close to a large value such as 1, the speedup factor is 6000. This is because, for large close, the original implementation is O(N^3); specifically, a.replace can be called O(N^2) times, at a cost of O(N) each. So the largest speedup is achieved when adjacent elements are likely to be close.
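
To get a feel for the O(N) cost of each call, here is a rough micro-benchmark sketch (the Series size is an arbitrary choice); each pd.Series.replace scans the whole Series:

import numpy as np
import pandas as pd
from timeit import timeit

s = pd.Series(np.random.rand(100_000))
v = s.iloc[0]
t = timeit(lambda: s.replace(v, 0.5), number=10) / 10
print(f'one replace over 100k rows: {t:.2e} s')  # roughly linear in len(s)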
For reference, here is an O(N^2) implementation that is equivalent to your code (I do not recommend using this one in practice):
import pandas as pd
import numpy as np

np.random.seed(0)

def custom_avg(arr, indices, close):
    new_indices = list()
    last = indices[-1]
    for i in indices:
        if arr[i] - arr[i-1] < close:
            new_indices.append(i)
            avg = (arr[i-1] + arr[i]) / 2
            arr[i-1] = avg
            if i != last and arr[i+1] - arr[i] >= close:
                arr[i] = avg
    return new_indices

def filter_indices(indices):
    new_indices = list()
    second_dups = list()
    # handle empty index case
    if not indices:
        return new_indices, second_dups
    for i, j in zip(indices, indices[1:]):
        if i + 1 == j:
            # arr[i] is guaranteed to be different from arr[i-1]
            new_indices.append(i)
        else:
            # arr[i+1] is guaranteed to be a duplicate of arr[i]
            second_dups.append(i)
    second_dups.append(indices[-1])
    return new_indices, second_dups

def reshape_arr_(arr, close):
    indices = range(1, len(arr))
    dup_mask = np.zeros(arr.shape, bool)
    while indices:
        indices, second_dups = filter_indices(custom_avg(arr, indices, close))
        # print(f"n_inds = {len(indices)};\tn_dups = {len(second_dups)}")
        dup_mask[second_dups] = True
    return np.unique(arr[~dup_mask])
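
As a quick sanity check of the claimed equivalence (a sketch with a small, arbitrary input so your original function finishes quickly):

import numpy as np
import pandas as pd

np.random.seed(1)  # arbitrary seed for this check
arr = np.sort(np.unique(np.random.rand(200)))
close = 1 / arr.size
ref = reshape_arr(pd.Series(arr.copy()), close)  # your original function
out = reshape_arr_(arr.copy(), close)
assert np.allclose(np.sort(ref), out)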
The basic idea is as follows.

First, consider two adjacent elements (i, j) with j = i + 1. If arr[j] - arr[i] >= close in the current iteration, then arr[j] - arr[i] >= close still holds after the current iteration, since arr[i] can only decrease and arr[j] can only increase. So if the pair (i, j) is not averaged in the current iteration, it will not be averaged in any subsequent iteration, and we can avoid looking at (i, j) from then on.

Second, if the pair (i, j) is averaged but the pair (i+1, j+1) is not, then we know that arr[i] is a duplicate of arr[j]. Also, the last element modified in each iteration is always a duplicate.

Based on these observations, we need to process fewer and fewer indices in each iteration. The worst case is still O(N^2), which can be witnessed by setting close = arr.max() - arr.min() + 1.
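
The worst case can be sketched as follows: with close larger than the full range, every remaining adjacent pair is averaged on every pass, and the whole array collapses into a single cluster.

import numpy as np

arr = np.linspace(0.0, 1.0, 50)
out = reshape_arr_(arr.copy(), close=arr.max() - arr.min() + 1)
print(out.size)  # 1 -- everything merges into one value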
Some benchmarks:
from timeit import timeit

make_setup = """
from __main__ import np, pd, reshape_arr, reshape_arr_, reshape_arr_binom
np.random.seed(0)
arr = np.sort(np.unique(np.random.rand({N})))
close = {close}""".format

def benchmark(N, close):
    np.random.seed(0)
    setup = make_setup(N=N, close=close)
    print('Original:')
    print(timeit(
        stmt='reshape_arr(pd.Series(arr.copy()), close)',
        setup=setup,
        number=1,
    ))
    print('Quadratic:')
    print(timeit(
        stmt='reshape_arr_(arr.copy(), close)',
        setup=setup,
        number=10,
    ))
    print('Binomial:')
    print(timeit(
        stmt='reshape_arr_binom(arr.copy(), close)',
        setup=setup,
        number=10,
    ))

if __name__ == '__main__':
    print('N=10_000, close=1/N')
    benchmark(10_000, 1/10_000)
    print('N=100, close=1')
    benchmark(100, 1)
# N=10_000, close=1/N
# Original:
# 14.855983458999999
# Quadratic:
# 0.35902471400000024
# Binomial:
# 0.7207887170000014
# N=100, close=1
# Original:
# 4.132993569
# Quadratic:
# 0.11140068399999947
# Binomial:
# 0.007650813999998007
The output below (from the commented-out print statement in reshape_arr_) shows how the number of pairs we need to look at in the quadratic algorithm shrinks on each iteration:
n_inds = 39967; n_dups = 23273
n_inds = 25304; n_dups = 14663
n_inds = 16032; n_dups = 9272
n_inds = 10204; n_dups = 5828
n_inds = 6503; n_dups = 3701
n_inds = 4156; n_dups = 2347
n_inds = 2675; n_dups = 1481
n_inds = 1747; n_dups = 928
n_inds = 1135; n_dups = 612
n_inds = 741; n_dups = 394
n_inds = 495; n_dups = 246
n_inds = 327; n_dups = 168
n_inds = 219; n_dups = 108
n_inds = 145; n_dups = 74
n_inds = 95; n_dups = 50
n_inds = 66; n_dups = 29
n_inds = 48; n_dups = 18
n_inds = 36; n_dups = 12
n_inds = 26; n_dups = 10
n_inds = 20; n_dups = 6
n_inds = 15; n_dups = 5
n_inds = 10; n_dups = 5
n_inds = 6; n_dups = 4
n_inds = 3; n_dups = 3
n_inds = 1; n_dups = 2
n_inds = 0; n_dups = 1
Answer 2 (score: 2)
Testing the performance of the different algorithms for sorted, unique-valued input arrays (code appended below). Functions compared: OP_reshape_arr, reshape_arr_binom, reshape_arr_, and solve_sorted.

[Plots: runtime scaling with array size, using close = 1/arr.size]

[Plot: runtime scaling with interval length, using arr.size == 1_000; close is the interval length]
"""Performance plots.
Assuming a sorted, unique-valued array as an input.
Function names have format `a<id>_*` where <id> is the answer's id."""
from timeit import timeit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import perfplot
from scipy.stats import binom
from scipy.special import comb
def OP_reshape_arr(a, close):
    flag = True
    while flag:
        array = a.sort_values().unique()
        l = len(array)
        flag = False
        for i in range(l):
            previous_item = next_item = None
            if i > 0:
                previous_item = array[i - 1]
            if i < (l - 1):
                next_item = array[i + 1]
            if previous_item is not None:
                if abs(array[i] - previous_item) < close:
                    average = (array[i] + previous_item) / 2
                    flag = True
                    a.replace(previous_item, value=average, inplace=True)
                    a.replace(array[i], value=average, inplace=True)
            if next_item is not None:
                if abs(next_item - array[i]) < close:
                    flag = True
                    average = (array[i] + next_item) / 2
                    a.replace(array[i], value=average, inplace=True)
                    a.replace(next_item, value=average, inplace=True)
    return a.unique()
def _binom_mean(arr, scipy_cutoff=64):
    n = arr.shape[0]
    if arr.shape[0] == 1:
        return arr[0]
    elif n < scipy_cutoff:
        return np.average(arr, weights=comb(n-1, np.arange(n), exact=False))
    else:
        f = binom(n-1, 0.5).pmf
        return np.average(arr, weights=f(np.arange(n)))

def a57438948_reshape_arr_binom(arr, close):
    d = np.ediff1d(arr, to_begin=0) < close
    close_chunks = np.split(arr, np.where(~d)[0])
    return np.fromiter(
        (_binom_mean(c) for c in close_chunks),
        dtype=float,  # np.float was removed in NumPy 1.24
    )
def _custom_avg(arr, indices, close):
    new_indices = list()
    last = indices[-1]
    for i in indices:
        if arr[i] - arr[i-1] < close:
            new_indices.append(i)
            avg = (arr[i-1] + arr[i]) / 2
            arr[i-1] = avg
            if i != last and arr[i+1] - arr[i] >= close:
                arr[i] = avg
    return new_indices

def _filter_indices(indices):
    new_indices = list()
    second_dups = list()
    if not indices:
        return new_indices, second_dups
    for i, j in zip(indices, indices[1:]):
        if i + 1 == j:
            new_indices.append(i)
        else:
            second_dups.append(i)
    second_dups.append(indices[-1])
    return new_indices, second_dups

def a57438948_reshape_arr_(arr, close):
    indices = range(1, len(arr))
    dup_mask = np.zeros(arr.shape, bool)
    while indices:
        indices, second_dups = _filter_indices(_custom_avg(arr, indices, close))
        dup_mask[second_dups] = True
    return np.unique(arr[~dup_mask])
def a57438149_solve_sorted(a, close):
    while True:
        comp = a[1:] - a[:-1] < close
        if not comp.sum():
            break
        indices = np.tile(comp.nonzero()[0][:, None], (1, 2))
        indices[:, 1] += 1
        avg = a[indices].mean(axis=1)
        a[indices.ravel()] = np.repeat(avg, 2)
        a = np.unique(a)
    return a
np.random.seed(0)
a = np.unique(np.random.rand(10_000))
c = 1/a.size
ref = OP_reshape_arr(pd.Series(a.copy()), c)
test = [
    a57438948_reshape_arr_binom(a.copy(), c),
    a57438948_reshape_arr_(a.copy(), c),
    a57438149_solve_sorted(a, c),
]
assert all(x.shape == ref.shape and np.allclose(x, ref) for x in test)
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']

perfplot.bench(
    setup=lambda n: np.random.seed(0) or (np.unique(np.random.rand(n)), 1/n),
    kernels=[
        lambda x: OP_reshape_arr(pd.Series(x[0].copy()), x[1]),
        lambda x: a57438948_reshape_arr_binom(x[0].copy(), x[1]),
        lambda x: a57438948_reshape_arr_(x[0].copy(), x[1]),
        lambda x: a57438149_solve_sorted(x[0], x[1]),
    ],
    labels=['OP_reshape_arr', 'reshape_arr_binom', 'reshape_arr_', 'solve_sorted'],
    n_range=np.logspace(2, 4, 8).astype(int),
    xlabel='size of initial array (before np.unique; using interval length of 1/n)',
    logx=True,
    logy=True,
    colors=colors,
    automatic_order=False,
).plot()
plt.gca().set_xlim([1e2, 1e4])
plt.gca().set_ylim([1e-4, 20])
plt.savefig('scaling_with_array_size.png')
plt.close()
np.random.seed(0)
a = np.unique(np.random.rand(1_000_000))
c = 1/a.size
test = [
    a57438948_reshape_arr_binom(a.copy(), c),
    a57438948_reshape_arr_(a.copy(), c),
    a57438149_solve_sorted(a, c),
]
assert all(x.shape == test[0].shape and np.allclose(x, test[0]) for x in test)

perfplot.bench(
    setup=lambda n: np.random.seed(0) or (np.unique(np.random.rand(n)), 1/n),
    kernels=[
        lambda x: a57438948_reshape_arr_binom(x[0].copy(), x[1]),
        lambda x: a57438948_reshape_arr_(x[0].copy(), x[1]),
        lambda x: a57438149_solve_sorted(x[0], x[1]),
    ],
    labels=['reshape_arr_binom', 'reshape_arr_', 'solve_sorted'],
    n_range=np.logspace(4, 6, 5).astype(int),
    xlabel='size of initial array (before np.unique; using interval length of 1/n)',
    logx=True,
    logy=True,
    colors=colors[1:],
    automatic_order=False,
).plot()
plt.gca().set_xlim([1e4, 1e6])
plt.gca().set_ylim([5e-4, 10])
plt.savefig('scaling_with_array_size_2.png')
plt.close()

perfplot.bench(
    setup=lambda n: np.random.seed(0) or (np.unique(np.random.rand(1_000)), n),
    kernels=[
        lambda x: OP_reshape_arr(pd.Series(x[0].copy()), x[1]),
        lambda x: a57438948_reshape_arr_binom(x[0].copy(), x[1]),
        lambda x: a57438948_reshape_arr_(x[0].copy(), x[1]),
        lambda x: a57438149_solve_sorted(x[0], x[1]),
    ],
    labels=['OP_reshape_arr', 'reshape_arr_binom', 'reshape_arr_', 'solve_sorted'],
    n_range=np.logspace(-6, -2, 16),
    xlabel='length of interval (using array of size 1,000)',
    logx=True,
    logy=True,
    colors=colors,
    automatic_order=False,
).plot()
plt.gca().set_xlim([1e-6, 1e-2])
plt.gca().set_ylim([2e-5, 1e3])
plt.savefig('scaling_with_interval_length.png')
plt.close()