I have been testing some basic CUDA functions using the Numba package. My main goal is to implement a Richardson-Lucy algorithm on the GPU. The algorithm can be accelerated, and one of the main steps can be summarized by the following dummy function:
def dummy(arr1, arr2):
    return (arr1 * arr2).sum() / ((arr2**2).sum() + eps)
This function runs reasonably fast on the CPU, but I would like to keep everything on the GPU to avoid host <---> device copies.
To compare the speeds of the different computations, I wrote a small set of functions:
import numpy as np
from numba import njit, jit
import numba
import numba.cuda as cuda
import timeit
import time
# define our functions
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def add_gpu(a, b):
return a + b
@numba.vectorize(["float32(float32, float32)", "float64(float64, float64)"], target="cuda")
def mult_gpu(a, b):
return a * b
@cuda.reduce
def sum_gpu(a, b):
return a + b
@cuda.jit
def add_gpu_1d(a, b, c):
x = cuda.grid(1)
if x < c.size:
c[x] = a[x] + b[x]
@cuda.jit
def mult_gpu_1d(a, b, c):
x = cuda.grid(1)
if x < c.size:
c[x] = a[x] * b[x]
@cuda.jit
def mult_gpu_2d(a, b, c):
x, y = cuda.grid(2)
if x < c.shape[0] and y < c.shape[1]:
c[x, y] = a[x, y] * b[x, y]
@cuda.jit
def add_gpu_2d(a, b, c):
x, y = cuda.grid(2)
if x < c.shape[0] and y < c.shape[1]:
c[x, y] = a[x, y] + b[x, y]
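As an aside, here is a minimal sketch of how dummy() could be composed from these building blocks so that the full-size arrays never leave the device; the eps value here is just an assumed small constant:

def dummy_gpu(d_arr1, d_arr2, eps=1e-12):  # eps value is an assumption
    # mult_gpu (a CUDA-target ufunc) returns a new device array when called
    # with device arrays, so only the two reduced scalars produced by
    # sum_gpu are copied back to the host.
    num = sum_gpu(mult_gpu(d_arr1, d_arr2))
    den = sum_gpu(mult_gpu(d_arr2, d_arr2))
    return num / (den + eps)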
And some timing functions:
def avg_t(t, num):
    return np.mean(t) / num

def format_t(t):
    """Turn t into nice formatting"""
    if t < 1e-3:
        return "{:.1f} us".format(t * 1e6)
    elif t < 1:
        return "{:.1f} ms".format(t * 1e3)
    else:
        return "{:.1f} s".format(t)
def test_1d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 1D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in " + format_t(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()

    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_1d time = " + format_t(avg_t(t, num_times)))
    print()

    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_1d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_1d time = " + format_t(avg_t(t, num_times)))
    print()

    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1)), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
def test_2d_times(data_len, dtype=np.float32):
    num_times = 10
    title = "Testing 2D Data, Data length = {}, data type = {}".format(data_len, dtype)
    print(len(title) * "=")
    print(title)
    print(len(title) * "=")
    t = time.time()
    arr1, arr2 = np.empty((2, data_len, data_len), dtype=dtype)
    d_arr1 = cuda.to_device(arr1)
    d_arr2 = cuda.to_device(arr2)
    d_result = cuda.device_array_like(d_arr1)
    print("Data generated in {} seconds".format(time.time() - t))
    print("d_arr1 dtype =", d_arr1.dtype)
    print("d_arr1 size = ", d_arr1.size)
    print()

    print("Testing multiplication times")
    print("----------------------------")
    t = timeit.repeat((lambda: arr1 * arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: mult_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_mult_2d time = " + format_t(avg_t(t, num_times)))
    print()

    print("Testing sum times")
    print("------------------")
    t = timeit.repeat((lambda: arr1 + arr2), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu(d_arr1, d_arr2)), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu_2d(d_arr1, d_arr2, d_result)), number=num_times)
    print("cuda_add_2d time = " + format_t(avg_t(t, num_times)))
    print()

    print("Testing reduction times")
    print("-----------------------")
    t = timeit.repeat((lambda: arr1.sum()), number=num_times)
    print("cpu/numpy time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: add_gpu.reduce(d_arr1.ravel())), number=num_times)
    print("cuda vectorize time = " + format_t(avg_t(t, num_times)))
    t = timeit.repeat((lambda: sum_gpu(d_arr1.ravel())), number=num_times)
    print("sum_gpu time = " + format_t(avg_t(t, num_times)))
    print()
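One caveat about these timings: Numba CUDA kernel launches are asynchronous, so wrapping a bare launch in timeit may mostly measure launch overhead rather than kernel execution. A minimal synchronized timing helper might look like this (a sketch; note that recent Numba versions may also require an explicit launch configuration such as mult_gpu_1d[blocks, threads](...)):

def timed_sync(fn, number=10):
    """Time fn() with explicit synchronization so that asynchronous
    kernel launches actually finish before the clock stops."""
    cuda.synchronize()                 # drain any previously queued work
    start = time.time()
    for _ in range(number):
        fn()
    cuda.synchronize()                 # wait for the last kernel to complete
    return (time.time() - start) / number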
Running the test functions
numba.cuda.detect()
test_1d_times(2**24)
test_2d_times(2**12)
test_1d_times(2**24, dtype=np.float64)
test_2d_times(2**12, dtype=np.float64)
gives the following output:
Found 1 CUDA devices
id 0    b'GeForce GTX TITAN X'                              [SUPPORTED]
                      compute capability: 5.2
                           pci device id: 0
                              pci bus id: 3
Summary:
	1/1 devices are supported
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float32'>
============================================================================
Data generated in 88.2 ms
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 122.8 ms
cuda_mult_1d time = 206.8 us
Testing sum times
------------------
cpu/numpy time = 35.8 ms
cuda vectorize time = 106.1 ms
cuda_add_1d time = 212.6 us
Testing reduction times
-----------------------
cpu/numpy time = 16.7 ms
cuda vectorize time = 11.1 ms
sum_gpu time = 127.3 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float32'>
========================================================================
Data generated in 0.0800013542175293 seconds
d_arr1 dtype = float32
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 35.4 ms
cuda vectorize time = 97.9 ms
cuda_mult_2d time = 208.9 us
Testing sum times
------------------
cpu/numpy time = 36.3 ms
cuda vectorize time = 94.5 ms
cuda_add_2d time = 250.8 us
Testing reduction times
-----------------------
cpu/numpy time = 16.4 ms
cuda vectorize time = 15.8 ms
sum_gpu time = 125.4 ms
============================================================================
Testing 1D Data, Data length = 16777216, data type = <class 'numpy.float64'>
============================================================================
Data generated in 171.0 ms
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.2 ms
cuda vectorize time = 114.9 ms
cuda_mult_1d time = 201.9 us
Testing sum times
------------------
cpu/numpy time = 71.4 ms
cuda vectorize time = 71.0 ms
cuda_add_1d time = 217.2 us
Testing reduction times
-----------------------
cpu/numpy time = 29.0 ms
cuda vectorize time = 12.8 ms
sum_gpu time = 123.5 ms
========================================================================
Testing 2D Data, Data length = 4096, data type = <class 'numpy.float64'>
========================================================================
Data generated in 0.301849365234375 seconds
d_arr1 dtype = float64
d_arr1 size = 16777216
Testing multiplication times
----------------------------
cpu/numpy time = 73.7 ms
cuda vectorize time = 84.2 ms
cuda_mult_2d time = 226.2 us
Testing sum times
------------------
cpu/numpy time = 74.9 ms
cuda vectorize time = 84.3 ms
cuda_add_2d time = 208.7 us
Testing reduction times
-----------------------
cpu/numpy time = 29.9 ms
cuda vectorize time = 14.3 ms
sum_gpu time = 121.2 ms
It seems that the @numba.vectorize-decorated functions execute more slowly than both the CPU and the custom-written @cuda.jit functions, while the @cuda.jit functions give the expected order-of-magnitude speedup and nearly constant-time performance (results not shown).
On the other hand, the @cuda.reduce function runs significantly more slowly than either the @numba.vectorize function or the CPU function.
Is there a reason for the poor performance of the @numba.vectorize and @cuda.reduce functions? Is it possible to write a CUDA reduction kernel using just Numba?
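For concreteness, the kind of hand-written reduction I have in mind is a shared-memory tree reduction with a grid-stride loop. A minimal sketch (untested here, assuming float32 data and 256 threads per block):

from numba import float32

TPB = 256  # threads per block (assumed)

@cuda.jit
def partial_sum(arr, out):
    sdata = cuda.shared.array(TPB, float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    stride = cuda.gridsize(1)
    acc = float32(0.0)
    # grid-stride loop: each thread accumulates a private partial sum
    while i < arr.size:
        acc += arr[i]
        i += stride
    sdata[tid] = acc
    cuda.syncthreads()
    # tree reduction within the block
    s = TPB // 2
    while s > 0:
        if tid < s:
            sdata[tid] += sdata[tid + s]
        cuda.syncthreads()
        s //= 2
    if tid == 0:
        out[cuda.blockIdx.x] = sdata[0]

# one device-side pass produces per-block partial sums; the tiny final
# reduction over `blocks` values is finished on the host
d_arr = cuda.to_device(np.ones(2**24, dtype=np.float32))
blocks = 128
d_partial = cuda.device_array(blocks, dtype=np.float32)
partial_sum[blocks, TPB](d_arr, d_partial)
total = d_partial.copy_to_host().sum()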
EDIT:
It seems these are legitimate bugs in Numba: https://github.com/numba/numba/issues/2266, https://github.com/numba/numba/issues/2268
Answer 0 (score: 2)
I cannot explain the behavior of @numba.vectorize and @cuda.reduce; sometimes the results look a little strange to me. For example, in Negative Speed Gain Using Numba Vectorize target='cuda', @numba.vectorize slows the computation down, while using @cuda.jit speeds it up.
Here I would suggest trying PyCUDA (https://documen.tician.de/pycuda/). I tested the performance of a dot product (https://documen.tician.de/pycuda/array.html):
import numpy as np
from pycuda.curandom import rand as curand
import pycuda.gpuarray as gpuarray
import pycuda.driver as pycu
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
import numba.cuda as cuda
from time import time

dot = ReductionKernel(dtype_out=np.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="float *x, float *y")
n = 2**24
x = curand((n), dtype=np.float32)
y = curand((n), dtype=np.float32)
x_cpu = np.random.random((n))
y_cpu = np.random.random((n))

st = time()
x_dot_y = dot(x, y).get()
gpu_time = (time() - st)
print("GPU: ", gpu_time)

st = time()
x_dot_y_cpu = np.dot(x_cpu, y_cpu)
cpu_time = (time() - st)
print("CPU: ", cpu_time)
print("speedup: ", cpu_time / gpu_time)
On my PC (CPU: Intel Core2 Quad 3 GHz, GPU: NVIDIA GeForce GTX 580) I got the following results:
GPU: 0.00191593170166
CPU: 0.0518710613251
speedup: 27.0735440518
It should be noted that the code above does not account for the time needed to initialize and precompile the kernel, and this time can be significant. Taking it into account, I obtained:
GPU: 0.316560029984
CPU: 0.0511090755463
speedup: 0.161451449031
In that case the GPU code is slower than the CPU code. At the same time, for most applications you only need to initialize a kernel once and can then use it many times; in such cases, using PyCUDA reduction kernels looks reasonable.
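For example, the one-time cost can be kept out of the measurement with a throwaway warm-up call; a minimal sketch reusing the variables from the snippet above:

# warm-up: the first call pays the one-time initialization/compilation cost
_ = dot(x, y).get()

st = time()
x_dot_y = dot(x, y).get()   # steady-state timing
print("GPU (after warm-up): ", time() - st)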
Earlier, I tested the performance of Numba, PyCUDA and CUDA-C code by solving a 2D diffusion equation. I found that PyCUDA achieves almost the same performance as CUDA-C, while Numba shows worse performance. The figure below shows these results.