Question

有什么原因会导致以下代码只需 2 秒就能运行完，

def euclidean_distance_square(x1, x2):
    """Return the pairwise squared Euclidean distances between rows.

    Relies on the expansion ``|a - b|**2 = -2*a.b + |a|**2 + |b|**2`` so
    that the bulk of the work is a single matrix product.

    Args:
        x1: 2-D array whose rows are the first set of points.
        x2: 2-D array whose rows are the second set of points; it must
            have the same number of columns as ``x1``.

    Returns:
        2-D array of shape ``(x1.shape[0], x2.shape[0])`` where entry
        ``[i, j]`` is the squared distance between ``x1[i]`` and ``x2[j]``.
    """
    cross_term = -2 * np.dot(x1, x2.T)
    # Squared row norms of x1 as a column vector so that they broadcast
    # across the columns of the cross term.
    x1_sq_norms = np.expand_dims(np.sum(np.square(x1), axis=1), axis=1)
    # Squared row norms of x2 stay 1-D and broadcast across the rows.
    x2_sq_norms = np.sum(np.square(x2), axis=1)
    return cross_term + x1_sq_norms + x2_sq_norms

而以下numba代码在12s运行？

# The question's Numba attempt: the same NumPy expression as the plain
# version above, compiled in nopython mode. Every NumPy call inside still
# produces its own temporary array.
# NOTE(review): `jit` is presumably brought in via `from numba import jit`
# (the import is not shown). The test/timing section later calls a function
# named `euclidean_distance_square_numba_original`, presumably this
# definition under a different name -- confirm.
@jit(nopython=True)
def euclidean_distance_square(x1, x2):
   return -2*np.dot(x1, x2.T) + np.expand_dims(np.sum(np.square(x1), axis=1), axis=1) + np.sum(np.square(x2), axis=1)

我的 x1 是形状为 (1, 512) 的矩阵，x2 是形状为 (3000000, 512) 的矩阵。Numba 竟然会慢这么多，这很奇怪。是我用错了吗？

我真的需要加快速度，因为我需要运行这个函数 300 万次，而 2 秒仍然太慢。

我需要在CPU上运行它，因为你可以看到x2的维度是如此巨大，它无法加载到GPU（或至少我的GPU）上，没有足够的内存。

Answer 1

非常奇怪的是，numba可能会慢很多。

这并不太奇怪。当你在 numba 函数中调用 NumPy 函数时，实际调用的是这些函数的 numba 版本。它们可能比 NumPy 版本更快、更慢，或者一样快。你可能运气好，也可能运气不好（这次你运气不好！）。而且即使在 numba 函数中，由于使用了 NumPy 函数，你仍然会创建许多临时数组（点积结果一个，每次平方和每次求和各一个，点积加上第一个和再一个），所以你并没有利用 numba 带来的潜在优势。

我使用它错了吗？

基本上：是的。

我真的需要加快速度

好的，我试一试。

让我们先把沿轴 1 求平方和的调用展开成显式循环：

import numba as nb

@nb.njit
def sum_squares_2d_array_along_axis1(arr):
    """Return the sum of squared entries of each row of a 2-D array.

    Explicit-loop replacement for ``np.sum(np.square(arr), axis=1)`` that
    avoids materialising the squared array.

    Args:
        arr: 2-D array, indexed as ``arr[row, column]``.

    Returns:
        1-D array of length ``arr.shape[0]`` with the dtype of ``arr``.
    """
    n_rows = arr.shape[0]
    n_cols = arr.shape[1]
    out = np.empty(n_rows, dtype=arr.dtype)
    for row in range(n_rows):
        acc = 0
        for col in range(n_cols):
            item = arr[row, col]
            acc += item * item
        out[row] = acc
    return out


@nb.njit
def euclidean_distance_square_numba_v1(x1, x2):
    """Return pairwise squared Euclidean distances between rows of x1 and x2.

    Same formula as the NumPy version (``-2*a.b + |a|**2 + |b|**2``), but
    the two squared-norm terms come from the loop-based helper
    ``sum_squares_2d_array_along_axis1`` instead of ``np.sum(np.square(...))``.

    Args:
        x1: 2-D array of points, one per row.
        x2: 2-D array of points, one per row.

    Returns:
        2-D array of shape ``(x1.shape[0], x2.shape[0])``.
    """
    cross_term = -2 * np.dot(x1, x2.T)
    # Column vector so the x1 norms broadcast over the columns.
    x1_sq_norms = np.expand_dims(sum_squares_2d_array_along_axis1(x1), axis=1)
    x2_sq_norms = sum_squares_2d_array_along_axis1(x2)
    return cross_term + x1_sq_norms + x2_sq_norms

在我的电脑上，它已经比NumPy代码快2倍，比原来的Numba代码快10倍。

从经验来看，它比NumPy快2倍通常是极限（至少如果NumPy版本没有不必要的复杂或低效），但是你可以通过展开一切来挤出更多：

import numba as nb

@nb.njit
def euclidean_distance_square_numba_v2(x1, x2):
    """Squared Euclidean distance from the first row of x1 to each row of x2.

    Fully unrolled form of ``-2*a.b + |a|**2 + |b|**2``: the squared norm of
    the query row is computed once, then the cross term and the squared norm
    of each x2 row are accumulated in a single pass.

    Args:
        x1: 2-D array; only row 0 is read.
        x2: 2-D array of points, one per row.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    n_dims = x2.shape[1]

    # |x1[0]|**2 is the same for every output element, so compute it once.
    query_sq_norm = 0.
    for k in range(x1.shape[1]):
        query_sq_norm += x1[0, k] * x1[0, k]

    out = np.empty(n_points, dtype=x2.dtype)
    for row in range(n_points):
        acc = 0
        for k in range(n_dims):
            b = x2[row, k]
            acc += (-2) * x1[0, k] * b + b * b
        acc += query_sq_norm
        out[row] = acc
    return out

但与上一种方法相比，这只能提高约10-20％。

此时你可能会意识到代码还可以简化（尽管这多半不会让它更快）：

import numba as nb

@nb.njit
def euclidean_distance_square_numba_v3(x1, x2):
    """Squared Euclidean distance from the first row of x1 to each row of x2.

    Direct definition: sum over the columns of the squared differences.

    Args:
        x1: 2-D array; only row 0 is read.
        x2: 2-D array of points, one per row.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    n_points = x2.shape[0]
    n_dims = x2.shape[1]
    out = np.empty(n_points, dtype=x2.dtype)
    for row in range(n_points):
        acc = 0
        for k in range(n_dims):
            diff = x1[0, k] - x2[row, k]
            acc += diff * diff
        out[row] = acc
    return out

是的，这看起来非常直截了当，而且速度并不慢。

然而，兴奋之余，我忘了提及最显而易见的解决方案：scipy.spatial.distance.cdist，配合 sqeuclidean（欧氏距离平方）度量选项：

from scipy.spatial import distance
# Squared Euclidean distance between every row of x1 and every row of x2,
# computed by SciPy without any hand-written kernel.
distance.cdist(x1, x2, metric='sqeuclidean')

它并不比numba快，但无需自己编写函数即可使用......

测试

测试正确性并进行预热：

# Small hand-checkable inputs: one query row against five candidate rows.
x1 = np.array([[1.,2,3]])
x2 = np.array([[1.,2,3], [2,3,4], [3,4,5], [4,5,6], [5,6,7]])

# Calling every variant once also serves as the warm-up run before timing.
# NOTE(review): `euclidean_distance_square_numba_original` is not defined in
# the snippets shown; presumably it is the question's @jit version -- confirm.
res1 = euclidean_distance_square(x1, x2)
res2 = euclidean_distance_square_numba_original(x1, x2)
res3 = euclidean_distance_square_numba_v1(x1, x2)
res4 = euclidean_distance_square_numba_v2(x1, x2)
res5 = euclidean_distance_square_numba_v3(x1, x2)
# The NumPy result is the reference for all other variants.
np.testing.assert_array_equal(res1, res2)
np.testing.assert_array_equal(res1, res3)
# v2 and v3 return a 1-D array, so compare against the single row of the
# 2-D reference result.
np.testing.assert_array_equal(res1[0], res4)
np.testing.assert_array_equal(res1[0], res5)
# cdist is compared approximately rather than for exact equality.
np.testing.assert_almost_equal(res1, distance.cdist(x1, x2, metric='sqeuclidean'))

时序：

# Benchmark data: one 512-dimensional query row against one million
# candidate rows of random values.
x1 = np.random.random((1, 512))
x2 = np.random.random((1000000, 512))

# `%timeit` is an IPython magic, so these lines must be run in
# IPython/Jupyter; the comment under each call is the recorded result.
%timeit euclidean_distance_square(x1, x2)
# 2.09 s ± 54.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_original(x1, x2)
# 10.9 s ± 158 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v1(x1, x2)
# 907 ms ± 7.11 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v2(x1, x2)
# 715 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit euclidean_distance_square_numba_v3(x1, x2)
# 731 ms ± 34.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit distance.cdist(x1, x2, metric='sqeuclidean')
# 706 ms ± 4.99 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

注意：如果您有整数数组，则可能需要将numba函数中的硬编码0.0更改为0。

Answer 2

这是对@MSeifert回答的补充评论。还有一些办法可以进一步提升性能。与所有数值计算代码一样，建议先考虑哪种数据类型的精度足以满足您的问题。通常float32就足够了，有时甚至float64也不够。

我还想在这里提到fastmath关键字，这里可以再提高1.7倍速度。

[编辑]

对于一个简单的求和，我查看了LLVM代码，发现在向量化时求和被拆分成了若干部分和（使用AVX2时，double为4个部分和，float为8个）。这一点还有待进一步研究。

代码

import llvmlite.binding as llvm
# Pass `--debug-only=loop-vectorize` to LLVM to get loop-vectorizer
# diagnostics for the functions compiled below.
# NOTE(review): presumably this has to run before those functions are
# compiled, and may need an LLVM build with debug output enabled -- confirm.
llvm.set_option('', '--debug-only=loop-vectorize')

@nb.njit
def euclidean_distance_square_numba_v3(x1, x2):
    """Baseline kernel: squared distance from x1's first row to each x2 row.

    Compiled with default ``njit`` options; it is the reference that the
    ``fastmath`` and ``parallel`` variants are timed against.

    Args:
        x1: 2-D array; only row 0 is read.
        x2: 2-D array of points, one per row.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    row_count = x2.shape[0]
    col_count = x2.shape[1]
    distances = np.empty(row_count, dtype=x2.dtype)
    for r in range(row_count):
        total = 0
        for c in range(col_count):
            delta = x1[0, c] - x2[r, c]
            total += delta * delta
        distances[r] = total
    return distances

@nb.njit(fastmath=True)
def euclidean_distance_square_numba_v4(x1, x2):
    """Squared distance from x1's first row to each x2 row, with fastmath.

    Same loop as v3; the only differences are the ``fastmath=True`` compile
    option and the float accumulator start value.

    Args:
        x1: 2-D array; only row 0 is read.
        x2: 2-D array of points, one per row.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    row_count = x2.shape[0]
    col_count = x2.shape[1]
    distances = np.empty(row_count, dtype=x2.dtype)
    for r in range(row_count):
        total = 0.
        for c in range(col_count):
            delta = x1[0, c] - x2[r, c]
            total += delta * delta
        distances[r] = total
    return distances

@nb.njit(fastmath=True,parallel=True)
def euclidean_distance_square_numba_v5(x1, x2):
    """Parallel fastmath kernel: squared distance from x1[0] to each x2 row.

    The outer loop uses ``nb.prange``; each iteration writes only its own
    output element, so the rows are independent of each other.

    Args:
        x1: 2-D array; only row 0 is read.
        x2: 2-D array of points, one per row.

    Returns:
        1-D array of length ``x2.shape[0]`` with the dtype of ``x2``.
    """
    row_count = x2.shape[0]
    col_count = x2.shape[1]
    distances = np.empty(row_count, dtype=x2.dtype)
    for r in nb.prange(row_count):
        total = 0.
        for c in range(col_count):
            delta = x1[0, c] - x2[r, c]
            total += delta * delta
        distances[r] = total
    return distances

计时

float64
x1 = np.random.random((1, 512))
x2 = np.random.random((1000000, 512))

0.42 v3 @MSeifert
0.25 v4
0.18 v5 parallel-version
0.48 distance.cdist

float32
x1 = np.random.random((1, 512)).astype(np.float32)
x2 = np.random.random((1000000, 512)).astype(np.float32)

0.09 v5

如何明确声明类型

总的来说，我不推荐这样做。您的输入数组可能是C连续的（C-contiguous，如测试数据）、Fortran连续的或跨步（strided）的。如果你确定数据总是C连续的，可以这样写：

@nb.njit('double[:](double[:, ::1],double[:, ::1])',fastmath=True)
def euclidean_distance_square_numba_v6(x1, x2):
    """Explicitly typed fastmath kernel for C-contiguous float64 inputs.

    The signature string fixes both arguments to 2-D C-contiguous ``double``
    arrays and the result to a 1-D ``double`` array, so other layouts or
    dtypes are rejected.

    Args:
        x1: 2-D C-contiguous float64 array; only row 0 is read.
        x2: 2-D C-contiguous float64 array of points, one per row.

    Returns:
        1-D float64 array of length ``x2.shape[0]``.
    """
    row_count = x2.shape[0]
    col_count = x2.shape[1]
    distances = np.empty(row_count, dtype=x2.dtype)
    for r in range(row_count):
        total = 0.
        for c in range(col_count):
            delta = x1[0, c] - x2[r, c]
            total += delta * delta
        distances[r] = total
    return distances

这提供了与v4版本相同的性能，但如果输入数组不是C连续的（C-contiguous）或dtype不是np.float64，则会失败。

您也可以使用

@nb.njit('double[:](double[:, :],double[:, :])',fastmath=True)
def euclidean_distance_square_numba_v7(x1, x2):
    """Explicitly typed fastmath kernel for float64 inputs of any layout.

    The signature string declares both arguments as 2-D ``double`` arrays
    without a contiguity marker, so strided inputs are accepted as well.

    Args:
        x1: 2-D float64 array; only row 0 is read.
        x2: 2-D float64 array of points, one per row.

    Returns:
        1-D float64 array of length ``x2.shape[0]``.
    """
    row_count = x2.shape[0]
    col_count = x2.shape[1]
    distances = np.empty(row_count, dtype=x2.dtype)
    for r in range(row_count):
        total = 0.
        for c in range(col_count):
            delta = x1[0, c] - x2[r, c]
            total += delta * delta
        distances[r] = total
    return distances

这也适用于跨步数组，但比在C连续数组上使用上面那个版本慢得多（0.66s 对比 0.25s）。请注意，您的问题受内存带宽限制；对于CPU密集型计算，差异可能更大。

如果让Numba替你完成类型推断，它会自动检测数组是否连续（先传入连续的输入数据、之后再传入非连续数据，会触发重新编译）。

Answer 3

尽管@MSeifert的答案使得本答案显得有些多余，但我仍然发布它，因为它更详细地解释了为什么numba版本比numpy版本慢。

正如我们将要看到的，主要的罪魁祸首是numpy和numba的不同内存访问模式。

我们可以用更简单的函数重现行为：

import numpy as np
import numba as nb

def just_sum(x2):
    """Return the sum of each row of ``x2`` using plain NumPy.

    Args:
        x2: 2-D array-like.

    Returns:
        1-D array holding one total per row (reduction along axis 1).
    """
    row_totals = np.sum(x2, axis=1)
    return row_totals

@nb.jit('double[:](double[:, :])', nopython=True)
def nb_just_sum(x2):
    """Return the sum of each row of ``x2`` via Numba's ``np.sum``.

    Same expression as ``just_sum``, compiled in nopython mode with an
    explicit signature (2-D ``double`` input, 1-D ``double`` output).

    Args:
        x2: 2-D float64 array.

    Returns:
        1-D float64 array holding one total per row.
    """
    row_totals = np.sum(x2, axis=1)
    return row_totals

# Benchmark matrix. The timing commands that follow reference ``x``, which
# the original binding (``x2`` only) left undefined; ``x2`` is kept as an
# alias so both names refer to the same array.
x = x2 = np.random.random((2048, 2048))

现在的时间安排：

>>> %timeit just_sum(x)
2.33 ms ± 71.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
>>> %timeit nb_just_sum(x)
33.7 ms ± 296 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

这意味着numpy快了大约15倍！

在使用注释（例如numba --annotate-html sum.html numba_sum.py）编译numba代码时，我们可以看到numba如何执行求和（参见附录中求和的完整列表）：

初始化结果列
将整个第一列添加到结果列
将整个第二列添加到结果列
等

这种方法有什么问题？内存布局！数组按行主序存储，因此按列读取会比按行读取（numpy就是按行读取的）产生多得多的缓存未命中。有一篇很好的文章解释了可能出现的缓存效应。

正如我们所看到的，numba的总和实现还不是很成熟。但是，从上面的考虑，numba实现可能对列主序（即转置矩阵）具有竞争力：

>>> %timeit just_sum(x.T)
3.09 ms ± 66.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
>>> %timeit nb_just_sum(x.T)
3.58 ms ± 45.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

确实如此。

正如@MSeifert的代码所示，numba的主要优点是可以借助它减少临时numpy数组的数量。然而，一些看起来容易的事情并不容易，朴素的解决方案可能会非常糟糕。求和就是这样一种操作——不应该认为一个简单的循环就足够好——例如可以参见相关问题的讨论。

列出numba-summation：

 Function name: array_sum_impl_axis
in file: /home/ed/anaconda3/lib/python3.6/site-packages/numba/targets/arraymath.py
with signature: (array(float64, 2d, A), int64) -> array(float64, 1d, C)
show numba IR
194:    def array_sum_impl_axis(arr, axis):
195:        ndim = arr.ndim
196:    
197:        if not is_axis_const:
198:            # Catch where axis is negative or greater than 3.
199:            if axis < 0 or axis > 3:
200:                raise ValueError("Numba does not support sum with axis"
201:                                 "parameter outside the range 0 to 3.")
202:    
203:        # Catch the case where the user misspecifies the axis to be
204:        # more than the number of the array's dimensions.
205:        if axis >= ndim:
206:            raise ValueError("axis is out of bounds for array")
207:    
208:        # Convert the shape of the input array to a list.
209:        ashape = list(arr.shape)
210:        # Get the length of the axis dimension.
211:        axis_len = ashape[axis]
212:        # Remove the axis dimension from the list of dimensional lengths.
213:        ashape.pop(axis)
214:        # Convert this shape list back to a tuple using above intrinsic.
215:        ashape_without_axis = _create_tuple_result_shape(ashape, arr.shape)
216:        # Tuple needed here to create output array with correct size.
217:        result = np.full(ashape_without_axis, zero, type(zero))
218:    
219:        # Iterate through the axis dimension.
220:        for axis_index in range(axis_len):
221:            if is_axis_const:
222:                # constant specialized version works for any valid axis value
223:                index_tuple_generic = _gen_index_tuple(arr.shape, axis_index,
224:                                                       const_axis_val)
225:                result += arr[index_tuple_generic]
226:            else:
227:                # Generate a tuple used to index the input array.
228:                # The tuple is ":" in all dimensions except the axis
229:                # dimension where it is "axis_index".
230:                if axis == 0:
231:                    index_tuple1 = _gen_index_tuple(arr.shape, axis_index, 0)
232:                    result += arr[index_tuple1]
233:                elif axis == 1:
234:                    index_tuple2 = _gen_index_tuple(arr.shape, axis_index, 1)
235:                    result += arr[index_tuple2]
236:                elif axis == 2:
237:                    index_tuple3 = _gen_index_tuple(arr.shape, axis_index, 2)
238:                    result += arr[index_tuple3]
239:                elif axis == 3:
240:                    index_tuple4 = _gen_index_tuple(arr.shape, axis_index, 3)
241:                    result += arr[index_tuple4]
242:    
243:        return result

为什么这个numba代码比numpy代码慢6倍？

3 个答案:

测试