Question

我是 Numba 的新手，正在尝试加速我的蒙特卡洛（Monte Carlo）方法。我目前在 Ubuntu 14.04 上使用 GeForce 950M，CUDA 版本是 8.0.61。

当我尝试运行以下代码时，我从CUDA API获得了一些与内存相关的错误

代码：

@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
        greater, equal, phi, phi_sub):
    """Run the particle filter: one block per event, one thread per particle.

    The block index selects the first axis of ``y``, ``particles``,
    ``weight``, ``greater`` and ``equal``; the thread index selects the
    element each device helper touches in the per-time-step rows and in
    ``phi`` / ``phi_sub``.

    For every time step ``i >= 1`` the kernel draws ``particles[ty][i]`` from
    ``particles[ty][i-1]``, computes and normalizes ``weight[ty][i]``, fills
    ``phi`` / ``phi_sub`` and writes ``greater[ty][i]`` and ``equal[ty][i]``.
    All results are written in place; nothing is returned.

    Every thread of a block must run the loop (it contains block-wide
    barriers), so the block size has to match the particle count.
    """
    tx = cuda.threadIdx.x  # thread id in a 1D block = particle index
    ty = cuda.blockIdx.x   # block id in a 1D grid = event index

    # observations of the event handled by this block
    y_current = y[ty]

    # number of time steps
    tn = y_current.size

    for i in range(1, tn):
        prev_particles = particles[ty][i - 1]
        cur_particles = particles[ty][i]
        prev_weight = weight[ty][i - 1]
        cur_weight = weight[ty][i]

        # each thread draws its own particle and computes its own weight
        sirModule_sample_draw(rng_states, prev_particles, beta,
                              omega, cur_particles)
        sirModule_weight(cur_particles, prev_particles, prev_weight,
                         cur_weight, y_current[i], beta, omega, gamma)

        # arr_sum reads the weights written by *all* threads of the block,
        # so wait until every thread has stored its weight.
        cuda.syncthreads()
        weight_sum = arr_sum(cur_weight)

        # nobody may overwrite a weight while another thread is still summing
        cuda.syncthreads()
        arr_div(cur_weight, weight_sum)

        # calculate tau (each thread fills its own phi / phi_sub element)
        sirModule_tau(cur_particles, beta, omega, phi, phi_sub)

        # the dot products below read the whole phi / phi_sub arrays
        cuda.syncthreads()

        # NOTE(review): phi and phi_sub are shared by all blocks, so blocks
        # still race on them; a per-event buffer would be needed to fix that.
        if tx == 0:
            # one thread per block is enough: every thread would compute and
            # store exactly the same two values
            greater[ty][i] = greater[ty][i - 1] * dot(prev_weight, phi)
            equal[ty][i] = greater[ty][i - 1] * dot(prev_weight, phi_sub)

def main():
    """Build the inputs, launch the ``SIR`` kernel once and print the elapsed time.

    ``y`` is passed as a host NumPy array; the remaining arrays are copied to
    the device explicitly with ``cuda.to_device`` before the launch.
    """
    # model parameters
    beta, omega, gamma = 1, 1, 2

    # problem size: particles per event, number of events, steps per event
    pn = 100
    event_number = 50
    timestep = 100

    per_particle_shape = (event_number, timestep, pn)
    per_step_shape = (event_number, timestep)

    y = np.ones(per_step_shape, dtype=np.int8)
    particles = cuda.to_device(np.zeros(per_particle_shape, dtype=np.float32))
    weight = cuda.to_device(np.ones(per_particle_shape, dtype=np.float32))
    greater = cuda.to_device(np.ones(per_step_shape, dtype=np.float32))
    equal = cuda.to_device(np.ones(per_step_shape, dtype=np.float32))

    # phi / phi_sub hold one value per particle of a single time step
    row_length = particles[0][0].size
    phi = cuda.to_device(np.zeros(row_length, dtype=np.float32))
    phi_sub = cuda.to_device(np.zeros(row_length, dtype=np.float32))

    rng_states = create_xoroshiro128p_states(pn, seed=1)

    # one block per event, one thread per particle
    start = timer()
    SIR[event_number, pn](
        rng_states, y, particles, weight, beta, omega, gamma,
        greater, equal, phi, phi_sub,
    )
    elapsed = timer() - start

    print("sirModule1 took %f seconds" % elapsed)


if __name__ == '__main__':
    main()

然后我得到

numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR

numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemFree results in UNKNOWN_CUDA_ERROR

...错误

有人遇到同样的问题吗？我在网上查了一下，有些人认为这个问题来自WDDM TDR，但我认为只适用于Windows，对吗？

以下是代码中缺少的部分。

import numpy as np
import numba as nb
from timeit import default_timer as timer
from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32

"""
Look up table for factorial
"""
# Factorials 0! .. 20! as int64 (20! is the largest factorial that fits),
# copied to the device once at import time.
LOOKUP_TABLE = cuda.to_device(
    np.array([math.factorial(k) for k in range(21)], dtype='int64')
)


"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
    """Return ``arr[0] + arr[1] + ... + arr[arr.size - 1]``.

    The whole sum is computed sequentially by the calling thread, starting
    from the integer ``0``.
    """
    total = 0
    for idx in range(arr.size):
        total += arr[idx]

    return total


"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
    """Return the dot product of ``arr1`` and ``arr2``.

    Iterates over ``arr1.size`` indices; ``arr2`` is indexed with the same
    indices and is not length-checked. The calling thread computes the whole
    product sequentially, starting from the integer ``0``.
    """
    acc = 0
    for idx in range(arr1.size):
        acc += arr1[idx] * arr2[idx]

    return acc


"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
    """Divide, in place, the calling thread's element of ``arr`` by ``div``.

    Only ``arr[cuda.threadIdx.x]`` is modified; the index is not checked
    against the array length.
    """
    tid = cuda.threadIdx.x
    scaled = arr[tid] / div
    arr[tid] = scaled

"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
    """Draw the calling thread's next sample, rejecting values >= 1.

    Repeatedly computes ``inp[t] + beta + omega * N(0, 1)`` for
    ``t = cuda.threadIdx.x`` until the result is below 1, then stores it in
    ``out[t]``.

    rng_states: xoroshiro128+ state array. The state used is the global
        thread index wrapped to the number of states, so each thread of the
        grid gets a private state when the caller allocates one state per
        launched thread. With exactly one state per block thread the index
        equals ``threadIdx.x``, and all blocks then share those states.
    inp: array read at index ``threadIdx.x``.
    beta, omega: scalars giving the offset and the noise scale.
    out: array written at index ``threadIdx.x``.
    """
    thread_id = cuda.threadIdx.x

    # Wrapping keeps the lookup in bounds for any state-array size.
    state_id = cuda.grid(1) % rng_states.shape[0]

    # draw candidate samples from the normal distribution and keep the first
    # one that is less than 1
    while True:
        candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, state_id)

        if candidate < 1:
            out[thread_id] = candidate
            break


"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma):
    """Compute the calling thread's unnormalized weight and store it in ``out``.

    Only element ``cuda.threadIdx.x`` of ``current``, ``previous``,
    ``weight`` and ``out`` is accessed.

    current, previous: arrays holding this step's and the previous step's
        values.
    weight: array of previous weights, used as a multiplicative factor.
    out: array receiving the new weight.
    y: integer used both as an exponent and as an index into the
        module-level factorial table ``LOOKUP_TABLE``.
    beta, omega, gamma: scalar model parameters.
    """
    thread_id = cuda.threadIdx.x

    # transition term: 0.5 * (1 + erf(Z)) of the standardized step
    Z = (current[thread_id] - (previous[thread_id] + beta)) / omega
    p1_div_p3 = 1.0 / 2.0 * (1.0 + math.erf(Z))

    # observation term
    mu = math.log(1 + math.exp(gamma * current[thread_id]))
    # NOTE(review): LOOKUP_TABLE[y] is not bounds-checked; y must be a valid
    # index into the table (0..20).
    # NOTE(review): if this is meant to be a Poisson pmf the factor would be
    # exp(-mu), not exp(mu) -- confirm against the model definition.
    p2 = math.exp(mu) * mu**y / LOOKUP_TABLE[y]

    out[thread_id] = weight[thread_id] * p2 * p1_div_p3


"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
    """Fill the calling thread's element of ``phi`` and ``phi_sub``.

    For ``t = cuda.threadIdx.x`` stores
    ``phi[t] = 0.5 * (1 + erf((1 - (current[t] + beta)) / omega))`` and
    ``phi_sub[t] = 1 - phi[t]``.
    """
    thread_id = cuda.threadIdx.x

    # calculate phi distribution and subtract from 1
    Z = ( 1 - ( current[ thread_id ] + beta ) ) / omega
    phi[ thread_id ] = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
    # phi_sub is computed from the value as stored in phi
    phi_sub[ thread_id ] = 1 - phi[ thread_id ]

但这些都是设备函数（device function）。问题的根源会在这里吗？

关于错误，我收到了以下错误信息（我代码中的第207行是调用SIR模块的地方）：

Traceback (most recent call last):
  File "CUDA_MonteCarlo_Testesr.py", line 214, in <module>
    main()
  File "CUDA_MonteCarlo_Testesr.py", line 207, in main
    omega, gamma, greater, equal, phi, phi_sub)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 703, in __call__
cfg(*args)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 483, in __call__
sharedmem=self.sharedmem)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 585, in _kernel_call
wb()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 600, in <lambda>
retr.append(lambda: devary.copy_to_host(val, stream=stream))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/devicearray.py", line 198, in copy_to_host
_driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1597, in device_to_host
fn(host_pointer(dst), device_pointer(src), size, *varargs)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Traceback (most recent call last):
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 647, in _exitfunc
f()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1099, in deref
mem.free()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1013, in free
self._finalizer()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 863, in core
deallocations.add_item(dtor, handle, size=bytesize)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 519, in add_item
self.clear()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 530, in clear
dtor(handle)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemFree results in UNKNOWN_CUDA_ERROR

Answer 1

我认为可能有两个问题。

1. 我不确定在 main 之外使用 LOOKUP_TABLE = cuda.to_device(...) 是否有效。我想你是在尝试创建一个设备数组，但我认为你应该使用 numba.cuda.device_array()。
2. 你似乎没有正确地将数组 y 传输到设备上以供使用。

当我进行这两项更改时，代码似乎在没有CUDA运行时错误的情况下运行：

# cat t1.py
import numpy as np
import numba as nb
from timeit import default_timer as timer
# from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32

"""
Look up table for factorial
"""


"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
    """Return ``arr[0] + arr[1] + ... + arr[arr.size - 1]``.

    The whole sum is computed sequentially by the calling thread, starting
    from the integer ``0``.
    """
    total = 0
    for idx in range(arr.size):
        total += arr[idx]

    return total


"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
    """Return the dot product of ``arr1`` and ``arr2``.

    Iterates over ``arr1.size`` indices; ``arr2`` is indexed with the same
    indices and is not length-checked. The calling thread computes the whole
    product sequentially, starting from the integer ``0``.
    """
    acc = 0
    for idx in range(arr1.size):
        acc += arr1[idx] * arr2[idx]

    return acc


"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
    """Divide, in place, the calling thread's element of ``arr`` by ``div``.

    Only ``arr[cuda.threadIdx.x]`` is modified; the index is not checked
    against the array length.
    """
    tid = cuda.threadIdx.x
    scaled = arr[tid] / div
    arr[tid] = scaled

"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
    """Draw the calling thread's next sample, rejecting values >= 1.

    Repeatedly computes ``inp[t] + beta + omega * N(0, 1)`` for
    ``t = cuda.threadIdx.x`` until the result is below 1, then stores it in
    ``out[t]``.

    rng_states: xoroshiro128+ state array. The state used is the global
        thread index wrapped to the number of states, so each thread of the
        grid gets a private state when the caller allocates one state per
        launched thread. With exactly one state per block thread the index
        equals ``threadIdx.x``, and all blocks then share those states.
    inp: array read at index ``threadIdx.x``.
    beta, omega: scalars giving the offset and the noise scale.
    out: array written at index ``threadIdx.x``.
    """
    thread_id = cuda.threadIdx.x

    # Wrapping keeps the lookup in bounds for any state-array size.
    state_id = cuda.grid(1) % rng_states.shape[0]

    # draw candidate samples from the normal distribution and keep the first
    # one that is less than 1
    while True:
        candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, state_id)

        if candidate < 1:
            out[thread_id] = candidate
            break


"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma, lt):
    """Compute the calling thread's unnormalized weight and store it in ``out``.

    Only element ``cuda.threadIdx.x`` of ``current``, ``previous``,
    ``weight`` and ``out`` is accessed.

    current, previous: arrays holding this step's and the previous step's
        values.
    weight: array of previous weights, used as a multiplicative factor.
    out: array receiving the new weight.
    y: integer used both as an exponent and as an index into ``lt``.
    beta, omega, gamma: scalar model parameters.
    lt: lookup table indexed by ``y``; it is the divisor of the observation
        term.
    """
    thread_id = cuda.threadIdx.x

    # transition term: 0.5 * (1 + erf(Z)) of the standardized step
    Z = (current[thread_id] - (previous[thread_id] + beta)) / omega
    p1_div_p3 = 1.0 / 2.0 * (1.0 + math.erf(Z))

    # observation term
    mu = math.log(1 + math.exp(gamma * current[thread_id]))
    # NOTE(review): lt[y] is not bounds-checked; y must be a valid index
    # into the table.
    # NOTE(review): if this is meant to be a Poisson pmf the factor would be
    # exp(-mu), not exp(mu) -- confirm against the model definition.
    p2 = math.exp(mu) * mu**y / lt[y]

    out[thread_id] = weight[thread_id] * p2 * p1_div_p3


"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
    """Fill the calling thread's element of ``phi`` and ``phi_sub``.

    For ``t = cuda.threadIdx.x`` stores
    ``phi[t] = 0.5 * (1 + erf((1 - (current[t] + beta)) / omega))`` and
    ``phi_sub[t] = 1 - phi[t]``.
    """
    tid = cuda.threadIdx.x

    # standardized distance between the shifted current value and 1
    z = (1 - (current[tid] + beta)) / omega

    phi[tid] = 0.5 * (1.0 + math.erf(z))
    # the complement is taken from the value as stored in phi
    phi_sub[tid] = 1 - phi[tid]

@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
        greater, equal, phi, phi_sub, lt):
    """Run the particle filter: one block per event, one thread per particle.

    The block index selects the first axis of ``y``, ``particles``,
    ``weight``, ``greater`` and ``equal``; the thread index selects the
    element each device helper touches in the per-time-step rows and in
    ``phi`` / ``phi_sub``. ``lt`` is forwarded unchanged to
    ``sirModule_weight``.

    For every time step ``i >= 1`` the kernel draws ``particles[ty][i]`` from
    ``particles[ty][i-1]``, computes and normalizes ``weight[ty][i]``, fills
    ``phi`` / ``phi_sub`` and writes ``greater[ty][i]`` and ``equal[ty][i]``.
    All results are written in place; nothing is returned.

    Every thread of a block must run the loop (it contains block-wide
    barriers), so the block size has to match the particle count.
    """
    tx = cuda.threadIdx.x  # thread id in a 1D block = particle index
    ty = cuda.blockIdx.x   # block id in a 1D grid = event index

    # observations of the event handled by this block
    y_current = y[ty]

    # number of time steps
    tn = y_current.size

    for i in range(1, tn):
        prev_particles = particles[ty][i - 1]
        cur_particles = particles[ty][i]
        prev_weight = weight[ty][i - 1]
        cur_weight = weight[ty][i]

        # each thread draws its own particle and computes its own weight
        sirModule_sample_draw(rng_states, prev_particles, beta,
                              omega, cur_particles)
        sirModule_weight(cur_particles, prev_particles, prev_weight,
                         cur_weight, y_current[i], beta, omega, gamma, lt)

        # arr_sum reads the weights written by *all* threads of the block,
        # so wait until every thread has stored its weight.
        cuda.syncthreads()
        weight_sum = arr_sum(cur_weight)

        # nobody may overwrite a weight while another thread is still summing
        cuda.syncthreads()
        arr_div(cur_weight, weight_sum)

        # calculate tau (each thread fills its own phi / phi_sub element)
        sirModule_tau(cur_particles, beta, omega, phi, phi_sub)

        # the dot products below read the whole phi / phi_sub arrays
        cuda.syncthreads()

        # NOTE(review): phi and phi_sub are shared by all blocks, so blocks
        # still race on them; a per-event buffer would be needed to fix that.
        if tx == 0:
            # one thread per block is enough: every thread would compute and
            # store exactly the same two values
            greater[ty][i] = greater[ty][i - 1] * dot(prev_weight, phi)
            equal[ty][i] = greater[ty][i - 1] * dot(prev_weight, phi_sub)

def main():
    """Copy all inputs to the GPU, launch ``SIR`` once and print the elapsed time.

    The launch uses one block per event and one thread per particle. The
    reported time covers the launch and the kernel's execution; the first
    call of a lazily compiled kernel also includes JIT compilation.
    """
    beta = 1
    omega = 1
    gamma = 2

    pn = 100
    event_number = 50
    timestep = 100

    # Factorials 0! .. 20!, handed to the kernel as an explicit argument.
    LOOKUP_TABLE = cuda.to_device(np.array([
        1, 1, 2, 6, 24, 120, 720, 5040, 40320,
        362880, 3628800, 39916800, 479001600,
        6227020800, 87178291200, 1307674368000,
        20922789888000, 355687428096000, 6402373705728000,
        121645100408832000, 2432902008176640000], dtype='int64'))

    hy = np.ones((event_number, timestep), dtype=np.uint32)
    print(hy.size)
    print(hy)
    y = cuda.to_device(hy)
    particles = cuda.to_device(np.zeros((event_number, timestep, pn), dtype=np.float32))
    weight = cuda.to_device(np.ones((event_number, timestep, pn), dtype=np.float32))
    greater = cuda.to_device(np.ones((event_number, timestep), dtype=np.float32))
    equal = cuda.to_device(np.ones((event_number, timestep), dtype=np.float32))

    # one entry per particle of a single time step
    phi = cuda.to_device(np.zeros(pn, dtype=np.float32))
    phi_sub = cuda.to_device(np.zeros(pn, dtype=np.float32))

    # One RNG state per launched thread (event_number blocks x pn threads).
    # The first pn states are identical to those of a pn-sized allocation
    # with the same seed.
    rng_states = create_xoroshiro128p_states(event_number * pn, seed=1)

    start = timer()
    SIR[event_number, pn](rng_states, y, particles, weight, beta, omega,
                          gamma, greater, equal, phi, phi_sub, LOOKUP_TABLE)
    # Kernel launches are asynchronous: wait for the kernel to finish before
    # reading the clock, otherwise only the launch overhead is measured.
    cuda.synchronize()
    elapsed = timer() - start

    print("sirModule1 took %f seconds" % elapsed)


if __name__ == '__main__':
    main()

# cuda-memcheck python t1.py
========= CUDA-MEMCHECK
5000
[[1 1 1 ..., 1 1 1]
 [1 1 1 ..., 1 1 1]
 [1 1 1 ..., 1 1 1]
 ...,
 [1 1 1 ..., 1 1 1]
 [1 1 1 ..., 1 1 1]
 [1 1 1 ..., 1 1 1]]
sirModule1 took 0.840958 seconds
========= ERROR SUMMARY: 0 errors
#

Answer 2

解决了！我正在使用Ubuntu 16.04。首次安装Numba时，numba.cuda函数可以正常工作。但是后来我遇到了这类错误

raise CudaAPIError(retcode, msg)

CudaAPIError: Call to cuMemcpyHtoD results in CUDA_ERROR_LAUNCH_FAILED

将系统置于“挂起”状态时会遇到这些错误。为了避免此类错误，请重新启动系统或不要挂起。

使用Numba在Python上发生CUDA API错误

2 个答案: