对于我的研究,我必须实现两组向量之间的成对L1距离计算,每组向量均表示为NumPy矩阵(向量为行)。我必须分别使用两个循环、一个循环和无循环三种方式来完成此操作。我预计,由于NumPy在矢量化方面非常出色,这些算法的速度排名应该是:两个循环慢于一个循环,一个循环又慢于无循环。
我已经编写了函数:
def f_cdist_2(X1, X2):
    """Compute pairwise L1 (Manhattan) distances using two explicit loops.

    Args:
        X1: 2-D NumPy array whose rows are vectors.
        X2: 2-D NumPy array whose rows are vectors; its rows must be
            subtractable element-wise from the rows of ``X1``.

    Returns:
        A float64 array of shape ``(X1.shape[0], X2.shape[0])`` where entry
        ``[i, j]`` is the sum of absolute differences between row ``i`` of
        ``X1`` and row ``j`` of ``X2``.
    """
    res = np.zeros(shape=(X1.shape[0], X2.shape[0]), dtype=np.float64)
    for ix1, row1 in enumerate(X1):
        for ix2, row2 in enumerate(X2):
            # One scalar distance per (row of X1, row of X2) pair.
            res[ix1, ix2] = np.abs(row1 - row2).sum()
    return res
def f_cdist_1(X1, X2):
    """Compute pairwise L1 (Manhattan) distances using a single loop.

    The loop runs over the rows of ``X1``; each row is tiled to the height of
    ``X2`` so that one vectorized expression yields a full row of the result.

    Args:
        X1: 2-D NumPy array whose rows are vectors.
        X2: 2-D NumPy array whose rows are vectors, with as many columns as
            ``X1``.

    Returns:
        A float64 array of shape ``(X1.shape[0], X2.shape[0])`` where entry
        ``[i, j]`` is the sum of absolute differences between row ``i`` of
        ``X1`` and row ``j`` of ``X2``.
    """
    n_rows_2 = X2.shape[0]
    res = np.zeros(shape=(X1.shape[0], n_rows_2), dtype=np.float64)
    for ix1, row in enumerate(X1):
        # Repeat the current row once per row of X2, then reduce over columns.
        tiled_row = np.tile(row, (n_rows_2, 1))
        res[ix1, :] = np.abs(tiled_row - X2).sum(axis=1)
    return res
def f_cdist_0(X1, X2):
    """Compute pairwise L1 (Manhattan) distances without any Python loop.

    Both inputs are tiled into 3-D arrays of shape
    ``(X1.shape[0], n_features, X2.shape[0])`` and reduced over the feature
    axis. Two full 3-D temporaries are materialized, so memory use grows with
    the product of all three dimensions.

    Args:
        X1: 2-D NumPy array whose rows are vectors.
        X2: 2-D NumPy array whose rows are vectors, with as many columns as
            ``X1``.

    Returns:
        An array of shape ``(X1.shape[0], X2.shape[0])`` where entry
        ``[i, j]`` is the sum of absolute differences between row ``i`` of
        ``X1`` and row ``j`` of ``X2``.
    """
    # X1 repeated along a new trailing axis, one copy per row of X2.
    left = np.tile(X1[:, :, np.newaxis], (1, 1, X2.shape[0]))
    # X2 transposed and repeated along a new leading axis, one copy per row of X1.
    right = np.tile(X2.T[np.newaxis, :, :], (X1.shape[0], 1, 1))
    # Axis 1 is the feature axis in both temporaries.
    res = np.abs(left - right).sum(axis=1)
    return res
然后我基于100次运行,使用形状为128 x 512和256 x 512的两个随机矩阵测试了性能,得出了结果:
两个循环:156毫秒
一个循环:32毫秒
无循环:135毫秒
我还尝试了scipy.spatial.distance中的cdist,并获得了最佳性能:9毫秒。
现在,有没有更好的方法来实现无循环功能?我希望它的性能至少与单循环一样好,但是现在我对如何改进它一无所知。
更新
使用 kwinkunks 的无循环方法实施,我已经在1024 x 1024矩阵上重新运行测试(再次进行100次试验),结果如下:
两个循环:5.7秒
一个循环:6.6秒
无循环:3.9秒
scipy.spatial.distance.cdist
:0.6秒
因此,在更大的矩阵上,无循环实现的确更好。scipy令人惊奇,但如果我理解正确,它是用C编写的,因此性能非常好。
更新
尝试使用4096 x 1024的np.float64
矩阵,设置相同:
两个循环:88秒
一个循环:66秒
无循环:内存不足(目前有〜18 Gb的可用RAM)
scipy.spatial.distance.cdist
:13秒
答案 0 :(得分:4)
您可以使用Pythran从矢量化版本中获得额外的加速。
f_dist.py:
import numpy as np
#pythran export f_dist(float64[:,:], float64[:,:])
def f_dist(X1, X2):
    """Return the pairwise L1 (Manhattan) distances between rows of X1 and X2.

    ``X1[:, None, :]`` inserts a middle axis so that subtracting ``X2``
    broadcasts to shape ``(X1.shape[0], X2.shape[0], n_features)``; the
    absolute differences are then summed over the last (feature) axis.

    Args:
        X1: 2-D array whose rows are vectors.
        X2: 2-D array whose rows are vectors, with as many columns as ``X1``.

    Returns:
        Array of shape ``(X1.shape[0], X2.shape[0])``.
    """
    return np.sum(np.abs(X1[:, None, :] - X2), axis=-1)
在我的笔记本电脑上,原始版本运行于:
> python -m timeit -s 'from f_dist import f_dist; from numpy.random import random; x = random((100,100)); y = random((100,100))' 'f_dist(x, y)'
100 loops, best of 3: 7.05 msec per loop
一旦编译内核:
> pythran f_dist.py
您可以对其进行基准测试:
> python -m timeit -s 'from f_dist import f_dist; from numpy.random import random; x = random((100,100)); y = random((100,100))' 'f_dist(x, y)'
1000 loops, best of 3: 1.21 msec per loop
使用SIMD指令可以进一步加快计算速度:
> pythran f_dist.py -DUSE_XSIMD -march=native
> python -m timeit -s 'from f_dist import f_dist; from numpy.random import random; x = random((100,100)); y = random((100,100))' 'f_dist(x, y)'
1000 loops, best of 3: 774 usec per loop
免责声明:我是pythran项目的核心开发人员。
答案 1 :(得分:0)
通过NumPy的广播,您可以避免平铺等操作
def f_dist(X1, X2):
    """Return the pairwise L1 (Manhattan) distances between rows of X1 and X2.

    Relies on NumPy broadcasting instead of explicit tiling: indexing ``X1``
    with ``[:, None, :]`` adds a middle axis, so the subtraction produces an
    array of shape ``(X1.shape[0], X2.shape[0], n_features)``.

    Args:
        X1: 2-D NumPy array whose rows are vectors.
        X2: 2-D NumPy array whose rows are vectors, with as many columns as
            ``X1``.

    Returns:
        Array of shape ``(X1.shape[0], X2.shape[0])`` holding the sum of
        absolute differences for every pair of rows.
    """
    pairwise_diff = X1[:, None, :] - X2
    # Reduce over the trailing feature axis.
    return np.sum(np.abs(pairwise_diff), axis=-1)
但是,令人惊讶的是(无论如何对我来说),它并不比循环快(我的机器上大约90毫秒,而f_cdist_1()
函数则为24毫秒)。
该广播技巧通常很有用。这意味着您可以执行以下操作:
>>> np.array([1,2,3]) * np.array([10, 20, 30])[:, None]
array([[10, 20, 30],
[20, 40, 60],
[30, 60, 90]])
答案 2 :(得分:0)
Example
import numpy as np
import numba as nb
# Debug output for SIMD vectorization: pass LLVM's `--debug-only=loop-vectorize`
# option through llvmlite to get the loop vectorizer's debug messages.
# NOTE(review): presumably this must run before the Numba functions below are
# compiled - confirm against the Numba documentation.
import llvmlite.binding as llvm
llvm.set_option('', '--debug-only=loop-vectorize')
########################################
#Your solution
#You can also use Numba on this, but apart from parallelization
#it is often better to write out the inner loop
def f_cdist(X1, X2):
    """Reference pairwise L1 (Manhattan) distance using two Python loops.

    Args:
        X1: 2-D NumPy array whose rows are vectors.
        X2: 2-D NumPy array whose rows are vectors; its rows must be
            subtractable element-wise from the rows of ``X1``.

    Returns:
        A float64 array of shape ``(X1.shape[0], X2.shape[0])`` where entry
        ``[i, j]`` is the sum of absolute differences between row ``i`` of
        ``X1`` and row ``j`` of ``X2``.
    """
    res = np.zeros(shape=(X1.shape[0], X2.shape[0]), dtype=np.float64)
    for ix1, row1 in enumerate(X1):
        for ix2, row2 in enumerate(X2):
            res[ix1, ix2] = np.abs(row1 - row2).sum()
    return res
@nb.njit(fastmath=True, parallel=True)
def f_cdist_nb(X1, X2):
    """Pairwise L1 (Manhattan) distances, compiled with Numba.

    The outer loop over the rows of ``X1`` uses ``nb.prange``; the reduction
    over features is written out as an explicit scalar loop.

    Args:
        X1: 2-D array whose rows are vectors.
        X2: 2-D array whose rows are vectors, with as many columns as ``X1``.

    Returns:
        Array of shape ``(X1.shape[0], X2.shape[0])`` with the dtype of
        ``X1``; entry ``[i, j]`` is the sum of absolute differences between
        row ``i`` of ``X1`` and row ``j`` of ``X2``.
    """
    # Some safety, because there is no bounds-checking
    assert X1.shape[1] == X2.shape[1]
    res = np.empty(shape=(X1.shape[0], X2.shape[0]), dtype=X1.dtype)
    for ix1 in nb.prange(X1.shape[0]):
        for ix2 in range(X2.shape[0]):
            # Writing out the inner loop often leads to better performance
            acc = 0.
            for i in range(X1.shape[1]):
                acc += np.abs(X1[ix1, i] - X2[ix2, i])
            res[ix1, ix2] = acc
    return res
性能
from scipy import spatial
#4096x1024
X1=np.random.rand(4096,1024)
X2=np.random.rand(4096,1024)
res1=f_cdist_nb(X1,X2)
res2=f_cdist(X1,X2)
res3=spatial.distance.cdist(X1, X2, 'cityblock')
#Check the results
np.allclose(res1,res2)
True
np.allclose(res1,res3)
True
%timeit res1=f_cdist_nb(X1,X2)
1.38 s ± 64.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=f_cdist(X1,X2)
1min 25s ± 483 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
17.6 s ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#1024x1024
X1=np.random.rand(1024,1024)
X2=np.random.rand(1024,1024)
%timeit res1=f_cdist_nb(X1,X2)
63.5 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
1.09 s ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
#512x512
X1=np.random.rand(512,512)
X2=np.random.rand(512,512)
%timeit res1=f_cdist_nb(X1,X2)
4.91 ms ± 280 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
130 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
编辑:手动优化的Numba版本
#Unroll and Jam loops
@nb.njit(fastmath=True, parallel=True)
def f_cdist_nb_3(X1, X2):
    """Pairwise L1 (Manhattan) distances with a hand-unrolled 4x4 kernel.

    The main loop processes 4 rows of ``X1`` against 4 rows of ``X2`` at a
    time, keeping 16 scalar accumulators so that each loaded element is used
    for four distances. Rows and columns left over when a dimension is not a
    multiple of 4 are handled by two plain remainder loops.

    Args:
        X1: 2-D array whose rows are vectors.
        X2: 2-D array whose rows are vectors, with as many columns as ``X1``.

    Returns:
        Array of shape ``(X1.shape[0], X2.shape[0])`` with the dtype of
        ``X1``; entry ``[i, j]`` is the sum of absolute differences between
        row ``i`` of ``X1`` and row ``j`` of ``X2``.
    """
    assert X1.shape[1] == X2.shape[1]
    res = np.empty(shape=(X1.shape[0], X2.shape[0]), dtype=X1.dtype)

    # Number of rows of X1 / X2 covered by complete 4-row tiles.
    rows_tiled = X1.shape[0] // 4 * 4
    cols_tiled = X2.shape[0] // 4 * 4

    for ix1 in nb.prange(X1.shape[0] // 4):
        for ix2 in range(X2.shape[0] // 4):
            sum_1, sum_2, sum_3, sum_4, sum_5, sum_6 = 0., 0., 0., 0., 0., 0.
            sum_7, sum_8, sum_9, sum_10, sum_11, sum_12 = 0., 0., 0., 0., 0., 0.
            sum_13, sum_14, sum_15, sum_16 = 0., 0., 0., 0.

            for i in range(X1.shape[1]):
                sum_1 += np.abs(X1[ix1*4+0, i] - X2[ix2*4+0, i])
                sum_2 += np.abs(X1[ix1*4+0, i] - X2[ix2*4+1, i])
                sum_3 += np.abs(X1[ix1*4+0, i] - X2[ix2*4+2, i])
                sum_4 += np.abs(X1[ix1*4+0, i] - X2[ix2*4+3, i])
                sum_5 += np.abs(X1[ix1*4+1, i] - X2[ix2*4+0, i])
                sum_6 += np.abs(X1[ix1*4+1, i] - X2[ix2*4+1, i])
                sum_7 += np.abs(X1[ix1*4+1, i] - X2[ix2*4+2, i])
                sum_8 += np.abs(X1[ix1*4+1, i] - X2[ix2*4+3, i])
                sum_9 += np.abs(X1[ix1*4+2, i] - X2[ix2*4+0, i])
                sum_10 += np.abs(X1[ix1*4+2, i] - X2[ix2*4+1, i])
                sum_11 += np.abs(X1[ix1*4+2, i] - X2[ix2*4+2, i])
                sum_12 += np.abs(X1[ix1*4+2, i] - X2[ix2*4+3, i])
                sum_13 += np.abs(X1[ix1*4+3, i] - X2[ix2*4+0, i])
                sum_14 += np.abs(X1[ix1*4+3, i] - X2[ix2*4+1, i])
                sum_15 += np.abs(X1[ix1*4+3, i] - X2[ix2*4+2, i])
                sum_16 += np.abs(X1[ix1*4+3, i] - X2[ix2*4+3, i])

            res[ix1*4+0, ix2*4+0] = sum_1
            res[ix1*4+0, ix2*4+1] = sum_2
            res[ix1*4+0, ix2*4+2] = sum_3
            res[ix1*4+0, ix2*4+3] = sum_4
            res[ix1*4+1, ix2*4+0] = sum_5
            res[ix1*4+1, ix2*4+1] = sum_6
            res[ix1*4+1, ix2*4+2] = sum_7
            res[ix1*4+1, ix2*4+3] = sum_8
            res[ix1*4+2, ix2*4+0] = sum_9
            res[ix1*4+2, ix2*4+1] = sum_10
            res[ix1*4+2, ix2*4+2] = sum_11
            res[ix1*4+2, ix2*4+3] = sum_12
            res[ix1*4+3, ix2*4+0] = sum_13
            res[ix1*4+3, ix2*4+1] = sum_14
            res[ix1*4+3, ix2*4+2] = sum_15
            res[ix1*4+3, ix2*4+3] = sum_16

    # Rest of the loop: leftover rows of X1 against every row of X2.
    for ix1 in range(rows_tiled, X1.shape[0]):
        for ix2 in range(X2.shape[0]):
            sum_1 = 0.
            for i in range(X1.shape[1]):
                sum_1 += np.abs(X1[ix1, i] - X2[ix2, i])
            res[ix1, ix2] = sum_1

    # Leftover rows of X2. Only the tiled rows of X1 are visited here: the
    # leftover rows of X1 were already filled completely by the loop above.
    for ix1 in range(rows_tiled):
        for ix2 in range(cols_tiled, X2.shape[0]):
            sum_1 = 0.
            for i in range(X1.shape[1]):
                sum_1 += np.abs(X1[ix1, i] - X2[ix2, i])
            res[ix1, ix2] = sum_1
    return res
时间
#4096x1024
X1=np.random.rand(4096,1024)
X2=np.random.rand(4096,1024)
res1=f_cdist_nb(X1,X2)
res2=f_cdist_nb_3(X1,X2)
res3=spatial.distance.cdist(X1, X2, 'cityblock')
#Check the results
print(np.allclose(res1,res2))
print(np.allclose(res1,res3))
%timeit res1=f_cdist_nb(X1,X2)
1.6 s ± 199 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res2=f_cdist_nb_3(X1,X2)
497 ms ± 50.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res3=spatial.distance.cdist(X1, X2, 'cityblock')
17.7 s ± 118 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)