def euclidean_distance_square(x1, x2):
    """Return the squared Euclidean distance between every row of x1 and x2.

    Uses the expansion ``|a - b|^2 = |a|^2 - 2 * a.b + |b|^2`` so that the
    whole computation is expressed with NumPy array operations.

    Args:
        x1: 2-D array; each row is one point.
        x2: 2-D array with the same number of columns as ``x1``.

    Returns:
        2-D array of shape ``(x1.shape[0], x2.shape[0])``; entry ``[i, j]``
        is the squared distance between ``x1[i]`` and ``x2[j]``.
    """
    cross_term = -2 * np.dot(x1, x2.T)
    # Turn the squared row norms of x1 into a column so they broadcast
    # across the columns of the cross term.
    x1_sq_norms = np.expand_dims(np.sum(np.square(x1), axis=1), axis=1)
    x2_sq_norms = np.sum(np.square(x2), axis=1)
    return cross_term + x1_sq_norms + x2_sq_norms
def euclidean_distance_square_numba_original(x1, x2):
    """Return the squared Euclidean distance between every row of x1 and x2.

    This is the second, textually identical definition from the question.
    It is exposed under the name the comparison and timing code further
    down actually calls, instead of silently re-binding
    ``euclidean_distance_square``.

    NOTE(review): presumably this variant carried a Numba jit decorator in
    the original post; no decorator is visible here, so none is applied.

    Args:
        x1: 2-D array; each row is one point.
        x2: 2-D array with the same number of columns as ``x1``.

    Returns:
        2-D array of shape ``(x1.shape[0], x2.shape[0])`` of squared
        distances.
    """
    x1_sq_norms = np.sum(np.square(x1), axis=1)
    x2_sq_norms = np.sum(np.square(x2), axis=1)
    return -2 * np.dot(x1, x2.T) + np.expand_dims(x1_sq_norms, axis=1) + x2_sq_norms
我的x1是形状为(1, 512)的矩阵,x2是形状为(3000000, 512)的矩阵。奇怪的是,Numba版本反而慢得多。是我的用法有问题吗?
答案 0 :(得分:15)
import numba as nb
def sum_squares_2d_array_along_axis1(arr):
    """Return the sum of the squared elements of each row of a 2-D array.

    Written as explicit loops over single elements, so no temporary array
    of squares is created.

    Args:
        arr: 2-D array.

    Returns:
        1-D array of length ``arr.shape[0]`` with the dtype of ``arr``.
    """
    n_rows = arr.shape[0]
    n_cols = arr.shape[1]
    row_sums = np.empty(n_rows, dtype=arr.dtype)
    for row in range(n_rows):
        acc = 0
        for col in range(n_cols):
            value = arr[row, col]
            acc += value * value
        row_sums[row] = acc
    return row_sums
def euclidean_distance_square_numba_v1(x1, x2):
    """Return the squared Euclidean distance between every row of x1 and x2.

    Same formula as the plain NumPy version, except that the squared row
    norms come from the loop-based ``sum_squares_2d_array_along_axis1``
    helper instead of ``np.sum(np.square(...), axis=1)``.

    Args:
        x1: 2-D array; each row is one point.
        x2: 2-D array with the same number of columns as ``x1``.

    Returns:
        2-D array of shape ``(x1.shape[0], x2.shape[0])`` of squared
        distances.
    """
    x1_sq_norms = sum_squares_2d_array_along_axis1(x1)
    x2_sq_norms = sum_squares_2d_array_along_axis1(x2)
    return -2 * np.dot(x1, x2.T) + np.expand_dims(x1_sq_norms, axis=1) + x2_sq_norms
import numba as nb
def euclidean_distance_square_numba_v2(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Evaluates ``|a|^2 - 2 * a.b + |b|^2`` with explicit loops and without
    any temporary arrays. Only the first row of ``x1`` is used.

    Args:
        x1: 2-D array whose first row is the query point.
        x2: 2-D array; each row is one point to compare against.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    # |x1[0]|^2 is the same for every row of x2, so compute it only once.
    x1_sq_norm = 0.
    for k in range(x1.shape[1]):
        x1_sq_norm += x1[0, k] * x1[0, k]

    n_points = x2.shape[0]
    out = np.empty(n_points, dtype=x2.dtype)
    for row in range(n_points):
        acc = 0
        for k in range(x2.shape[1]):
            b = x2[row, k]
            # Cross term and |b|^2 term accumulated in a single pass.
            acc += (-2) * x1[0, k] * b + b * b
        acc += x1_sq_norm
        out[row] = acc
    return out
import numba as nb
def euclidean_distance_square_numba_v3(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Accumulates the squared element-wise differences directly. Only the
    first row of ``x1`` is used.

    Args:
        x1: 2-D array whose first row is the query point.
        x2: 2-D array; each row is one point to compare against.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    out = np.empty(n_points, dtype=x2.dtype)
    for row in range(n_points):
        acc = 0
        for k in range(x2.shape[1]):
            diff = x1[0, k] - x2[row, k]
            acc += diff * diff
        out[row] = acc
    return out
from scipy.spatial import distance
# SciPy's ready-made counterpart; it is used as the reference result and as
# one of the timed candidates in the comparison below.
distance.cdist(x1, x2, metric='sqeuclidean')
# Small hand-checkable input: one query point and five candidate points.
x1 = np.array([[1.,2,3]])
x2 = np.array([[1.,2,3], [2,3,4], [3,4,5], [4,5,6], [5,6,7]])
# Evaluate every implementation on the same input.
res1 = euclidean_distance_square(x1, x2)
res2 = euclidean_distance_square_numba_original(x1, x2)
res3 = euclidean_distance_square_numba_v1(x1, x2)
res4 = euclidean_distance_square_numba_v2(x1, x2)
res5 = euclidean_distance_square_numba_v3(x1, x2)
# Every variant must reproduce the NumPy result exactly. v2 and v3 return a
# 1-D array, so they are compared against the single row res1[0].
np.testing.assert_array_equal(res1, res2)
np.testing.assert_array_equal(res1, res3)
np.testing.assert_array_equal(res1[0], res4)
np.testing.assert_array_equal(res1[0], res5)
# The SciPy result is compared with a tolerance rather than exactly.
np.testing.assert_almost_equal(res1, distance.cdist(x1, x2, metric='sqeuclidean'))
# Benchmark input: one 512-dimensional point against 1,000,000 points.
x1 = np.random.random((1, 512))
x2 = np.random.random((1000000, 512))
# IPython %timeit measurements; the comment below each call is the recorded
# result for that implementation.
%timeit euclidean_distance_square(x1, x2)
# 2.09 s ± 54.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_original(x1, x2)
# 10.9 s ± 158 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v1(x1, x2)
# 907 ms ± 7.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v2(x1, x2)
# 715 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v3(x1, x2)
# 731 ms ± 34.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit distance.cdist(x1, x2, metric='sqeuclidean')
# 706 ms ± 4.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
答案 1 :(得分:9)
这是对@MSeifert回答的补充。还有一些办法可以进一步提升性能。与所有数值计算代码一样,建议先考虑哪种数据类型的精度足以满足您的问题。通常float32就足够了,但有时甚至float64也不够。
对于这样一个简单的求和,我查看了生成的LLVM代码,发现求和(summation)在向量化时被拆分成若干个部分和(使用AVX2时,double为4个部分和,float为8个)。这一点还需要进一步研究。
import llvmlite.binding as llvm
# Forward the '--debug-only=loop-vectorize' option to LLVM; presumably used
# here to inspect how the loops below get vectorized (see surrounding text).
llvm.set_option('', '--debug-only=loop-vectorize')
def euclidean_distance_square_numba_v3(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Baseline for the variants that follow: the per-row accumulator starts
    from the integer literal ``0``, which is deliberately left as is so the
    comparison with v4 stays meaningful.

    Args:
        x1: 2-D array whose first row is the query point.
        x2: 2-D array; each row is one point to compare against.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    n_features = x2.shape[1]
    distances = np.empty(n_points, dtype=x2.dtype)
    for p in range(n_points):
        total = 0
        for f in range(n_features):
            delta = x1[0, f] - x2[p, f]
            total += delta * delta
        distances[p] = total
    return distances
def euclidean_distance_square_numba_v4(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Identical to v3 except that the per-row accumulator starts from the
    float literal ``0.`` instead of the integer ``0``.

    Args:
        x1: 2-D array whose first row is the query point.
        x2: 2-D array; each row is one point to compare against.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    n_features = x2.shape[1]
    distances = np.empty(n_points, dtype=x2.dtype)
    for p in range(n_points):
        # Float start value: the accumulator has a floating-point type from
        # the first iteration on.
        total = 0.
        for f in range(n_features):
            delta = x1[0, f] - x2[p, f]
            total += delta * delta
        distances[p] = total
    return distances
def euclidean_distance_square_numba_v5(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Same computation as v4, but the loop over the rows of ``x2`` uses
    ``nb.prange``. Each iteration writes only its own output element.

    NOTE(review): the compile decorator is not visible in this listing;
    presumably the function is meant to be compiled by Numba with parallel
    execution enabled so that ``prange`` has an effect.

    Args:
        x1: 2-D array whose first row is the query point.
        x2: 2-D array; each row is one point to compare against.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    n_features = x2.shape[1]
    distances = np.empty(n_points, dtype=x2.dtype)
    for p in nb.prange(n_points):
        total = 0.
        for f in range(n_features):
            delta = x1[0, f] - x2[p, f]
            total += delta * delta
        distances[p] = total
    return distances
# Benchmark input for the timings listed below: one 512-dimensional point
# against 1,000,000 points.
x1 = np.random.random((1, 512))
x2 = np.random.random((1000000, 512))
0.42 v3 @MSeifert
0.25 v4
0.18 v5 parallel-version
0.48 distance.cdist
# Same shapes as before, but converted to float32 for the timing below.
x1 = np.random.random((1, 512)).astype(np.float32)
x2 = np.random.random((1000000, 512)).astype(np.float32)
0.09 v5
@nb.njit('double[:](double[:, ::1],double[:, ::1])',fastmath=True)
def euclidean_distance_square_numba_v6(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Same loop as v4, compiled with an explicit signature: both inputs are
    declared as ``double[:, ::1]`` and the result as ``double[:]``, with
    ``fastmath=True``.

    Args:
        x1: 2-D array of doubles whose first row is the query point.
        x2: 2-D array of doubles; each row is one point to compare against.

    Returns:
        1-D array of doubles of length ``x2.shape[0]``.
    """
    n_points = x2.shape[0]
    n_features = x2.shape[1]
    distances = np.empty(n_points, dtype=x2.dtype)
    for p in range(n_points):
        total = 0.
        for f in range(n_features):
            delta = x1[0, f] - x2[p, f]
            total += delta * delta
        distances[p] = total
    return distances
这个版本的性能与v4版本相同,但如果输入数组不是C-contiguous(C连续)的,或者dtype不是np.float64,则会失败。
@nb.njit('double[:](double[:, :],double[:, :])',fastmath=True)
def euclidean_distance_square_numba_v7(x1, x2):
    """Return the squared distance from the point ``x1[0]`` to each row of x2.

    Same loop as v6, but the explicit signature declares the inputs as
    ``double[:, :]`` rather than ``double[:, ::1]``.

    Args:
        x1: 2-D array of doubles whose first row is the query point.
        x2: 2-D array of doubles; each row is one point to compare against.

    Returns:
        1-D array of doubles of length ``x2.shape[0]``.
    """
    n_points = x2.shape[0]
    n_features = x2.shape[1]
    distances = np.empty(n_points, dtype=x2.dtype)
    for p in range(n_points):
        total = 0.
        for f in range(n_features):
            delta = x1[0, f] - x2[p, f]
            total += delta * delta
        distances[p] = total
    return distances
这个版本也适用于非连续(跨步)数组,但比针对C-contiguous数组的版本慢得多(0.66s 对 0.25s)。请注意,您的问题受内存带宽限制;对于受CPU限制的计算,差异可能更大。
答案 2 :(得分:7)
尽管@MSeifert的答案已使本答案基本过时,我仍然发布它,因为它更详细地解释了为什么numba版本比numpy版本慢。
import numpy as np
import numba as nb
def just_sum(x2):
    """Sum the array along axis 1 using plain NumPy.

    Args:
        x2: array with at least two dimensions.

    Returns:
        The result of ``np.sum(x2, axis=1)``.
    """
    return np.sum(x2, axis=1)
@nb.jit('double[:](double[:, :])', nopython=True)
def nb_just_sum(x2):
    """Sum a 2-D array of doubles along axis 1 inside a Numba-compiled function.

    The body is the same ``np.sum(x2, axis=1)`` call as in ``just_sum``; the
    only difference is the jit decorator with an explicit signature.

    Args:
        x2: 2-D array of doubles.

    Returns:
        1-D array of doubles holding the sum of each row.
    """
    return np.sum(x2, axis=1)
>>> %timeit just_sum(x)
2.33 ms ± 71.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
>>> %timeit nb_just_sum(x)
33.7 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
在带注释编译numba函数时(例如 numba --annotate-html sum.html numba_sum.py),可以看到numba是如何执行求和的(完整的求和代码清单见文末):
这种方法有什么问题?内存布局!数组按行主序(row-major)存储,因此按列读取会比按行读取(numpy正是按行读取的)产生多得多的缓存未命中。a great article解释了可能的缓存效应。
>>> %timeit just_sum(x.T)
3.09 ms ± 66.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
>>> %timeit nb_just_sum(x.T)
3.58 ms ± 45.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
正如@MSeifert的代码所示,numba的主要优点是,在它的帮助下我们可以减少临时numpy数组的数量。然而,一些看起来容易的事情并不容易,一个天真的解决方案可能会非常糟糕。建立一个总和就是这样一个操作 - 不应该认为一个简单的循环足够好 - 参见例如this question。
Function name: array_sum_impl_axis
in file: /home/ed/anaconda3/lib/python3.6/site-packages/numba/targets/arraymath.py
with signature: (array(float64, 2d, A), int64) -> array(float64, 1d, C)
show numba IR
194: def array_sum_impl_axis(arr, axis):
195: ndim = arr.ndim
197: if not is_axis_const:
198: # Catch where axis is negative or greater than 3.
199: if axis < 0 or axis > 3:
200: raise ValueError("Numba does not support sum with axis"
201: "parameter outside the range 0 to 3.")
203: # Catch the case where the user misspecifies the axis to be
204: # more than the number of the array's dimensions.
205: if axis >= ndim:
206: raise ValueError("axis is out of bounds for array")
208: # Convert the shape of the input array to a list.
209: ashape = list(arr.shape)
210: # Get the length of the axis dimension.
211: axis_len = ashape[axis]
212: # Remove the axis dimension from the list of dimensional lengths.
213: ashape.pop(axis)
214: # Convert this shape list back to a tuple using above intrinsic.
215: ashape_without_axis = _create_tuple_result_shape(ashape, arr.shape)
216: # Tuple needed here to create output array with correct size.
217: result = np.full(ashape_without_axis, zero, type(zero))
219: # Iterate through the axis dimension.
220: for axis_index in range(axis_len):
221: if is_axis_const:
222: # constant specialized version works for any valid axis value
223: index_tuple_generic = _gen_index_tuple(arr.shape, axis_index,
224: const_axis_val)
225: result += arr[index_tuple_generic]
226: else:
227: # Generate a tuple used to index the input array.
228: # The tuple is ":" in all dimensions except the axis
229: # dimension where it is "axis_index".
230: if axis == 0:
231: index_tuple1 = _gen_index_tuple(arr.shape, axis_index, 0)
232: result += arr[index_tuple1]
233: elif axis == 1:
234: index_tuple2 = _gen_index_tuple(arr.shape, axis_index, 1)
235: result += arr[index_tuple2]
236: elif axis == 2:
237: index_tuple3 = _gen_index_tuple(arr.shape, axis_index, 2)
238: result += arr[index_tuple3]
239: elif axis == 3:
240: index_tuple4 = _gen_index_tuple(arr.shape, axis_index, 3)
241: result += arr[index_tuple4]
243: return result