我想将数组的一个切片传递给设备函数,在其中创建一些新数组,并返回由它们组合得到的结果。但这似乎并不是解决 CUDA 问题的通用做法,因为 Numba 不支持在设备函数内部创建列表和数组。那么,我应该如何绕过这一限制?
import numpy as np
from numba import cuda
from numba import *
@cuda.jit(device=True)
def comp_array_2(y):  # y is a 3d vector
    """Return ``y + 1`` component-wise as a 3-tuple.

    The whole-array expression ``y + 1`` would have to allocate a new array,
    which Numba's CUDA target does not support inside device code. Each of
    the three components is therefore computed as a scalar and the results
    are returned in a tuple, which device functions can return.

    Parameters
    ----------
    y : indexable with at least 3 numeric elements (e.g. one row of an array).

    Returns
    -------
    tuple
        ``(y[0] + 1, y[1] + 1, y[2] + 1)``; indexable at 0, 1 and 2.
    """
    return (y[0] + 1, y[1] + 1, y[2] + 1)
@cuda.jit(device=True)
def comp_array_1(y):  # y is a 3d vector
    """Return ``y * (y + 1)`` component-wise as a 3-tuple.

    Python lists are not supported in Numba CUDA device code, so the three
    products are returned as a tuple instead of a list. The result is still
    indexable at 0, 1 and 2.

    Parameters
    ----------
    y : indexable with at least 3 numeric elements (e.g. one row of an array).

    Returns
    -------
    tuple
        ``(m[0] * y[0], m[1] * y[1], m[2] * y[2])`` where
        ``m = comp_array_2(y)``.
    """
    # comp_array_2 supplies the (y + 1) factor; only elements 0..2 are read.
    mod_arr = comp_array_2(y)
    return (
        mod_arr[0] * y[0],
        mod_arr[1] * y[1],
        mod_arr[2] * y[2],
    )
@cuda.jit
def euler_sol_cuda(init, sol):
    """Kernel: fill each row of ``sol`` with ``comp_array_1`` of the matching row of ``init``.

    Each thread starts at its own index from ``cuda.grid(1)`` and advances by
    ``cuda.gridsize(1)``, so every row in ``range(init.shape[0])`` is handled
    whatever the launch configuration.

    Parameters
    ----------
    init : 2-D device array; rows are read and passed to ``comp_array_1``.
    sol : 2-D device array; columns 0..2 of each processed row are written.
        Assumed to have at least as many rows as ``init`` and at least
        3 columns (not checked here).
    """
    start = cuda.grid(1)
    stride = cuda.gridsize(1)
    # Threads loop
    for i in range(start, init.shape[0], stride):
        res = comp_array_1(init[i])
        # Store the three components as scalars rather than assigning a
        # Python sequence to the row ``sol[i]``; scalar element stores are the
        # form device code reliably supports.
        sol[i, 0] = res[0]
        sol[i, 1] = res[1]
        sol[i, 2] = res[2]
if __name__ == "__main__":
    # Problem size and launch configuration (host side).
    n = 128 * 1024
    blocks_per_grid = 32
    threads_per_block = 64
    # Copy random (n, 3) input to the device and allocate a same-shaped output.
    d_init = cuda.to_device(np.random.rand(n, 3))
    d_sol = cuda.device_array_like(d_init)
    euler_sol_cuda[blocks_per_grid, threads_per_block](d_init, d_sol)
    # Copy the result back to the host. The original printed the undefined
    # name ``sol_d``, which raised NameError; the output array is ``d_sol``.
    print(d_sol.copy_to_host())
所需的输出是形状为 (2^17, 3) 的数组,其结果应与下面的 NumPy 顺序(CPU)代码一致:
# NumPy reference on a small random (5, 3) sample; runs on the host only.
d_init = np.random.rand(5,3)
print(d_init)
# Element-wise y * (y + 1). As a bare expression its value is only echoed in
# an interactive session (REPL / notebook), not when run as a script.
d_init * (d_init + 1)
输出
`[[0.44905369 0.28315775 0.06539836]
[0.71547266 0.77073541 0.76203624]
[0.82483726 0.14611916 0.8317247 ]
[0.22490635 0.83327892 0.48090544]
[0.79825294 0.07037226 0.5455043 ]]`
`array([[0.6507029 , 0.36333606, 0.06967531],
[1.2273738 , 1.36476849, 1.34273547],
[1.50519376, 0.16746996, 1.52349068],
[0.27548921, 1.52763267, 0.71217548],
[1.43546068, 0.07532451, 0.84307924]])
`