使用Numba在Python上发生CUDA API错误

时间:2017-09-02 20:22:13

标签: python cuda numba

我是Numba的新手,正在尝试加速我的蒙特卡罗(Monte Carlo)方法。我目前在Ubuntu 14.04上使用GeForce 950M,CUDA版本是8.0.61。

当我尝试运行以下代码时,我从CUDA API获得了一些与内存相关的错误

代码:

@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma, 
    greater, equal, phi, phi_sub):
    """CUDA kernel: advance every event's particles through all time steps.

    The indexing in the body implies one block per event (``blockIdx.x``
    selects the row of ``y``, ``particles``, ``weight``, ``greater`` and
    ``equal``) and one thread per particle (the device helpers index their
    arrays with ``threadIdx.x``).

    For each time step ``i >= 1`` the calling thread draws a sample, computes
    and normalizes its weight, refreshes ``phi`` / ``phi_sub`` and then writes
    ``greater[ty][i]`` and ``equal[ty][i]``.

    NOTE(review): no ``cuda.syncthreads()`` separates the per-thread writes
    from the block-wide reads done by ``arr_sum`` and ``dot``, and ``phi`` /
    ``phi_sub`` are indexed by ``threadIdx.x`` only, so every block writes the
    same entries -- this looks like a data race; confirm.
    """
    # thread/block index for accessing data
    tx = cuda.threadIdx.x # Thread id in a 1D block = particle index
    ty = cuda.blockIdx.x # Block id in a 1D grid = event index
    bw = cuda.blockDim.x # Block width, i.e. number of threads per block = particle number
    pos = tx + ty * bw # computed flattened index inside the array (not used below)

    # get current event y_t
    y_current = y[ ty ]

    # get number of time steps
    tn = y_current.size

    # iterator over timestep (step 0 is the initial state and is only read)
    for i in range(1, tn):
        # draw samples
        sirModule_sample_draw(rng_states, particles[ty][i-1], beta, 
                                 omega, particles[ty][i])

        # get weight
        sirModule_weight(particles[ty][i], particles[ty][i-1], weight[ty][i-1], 
                            weight[ty][i], y_current[i], beta, omega, gamma)

        # normalize weight: each thread sums the whole row, then divides
        # only its own element
        weight_sum = arr_sum(weight[ty][i])
        arr_div(weight[ty][i], weight_sum)

        # calculate tau
        sirModule_tau(particles[ty][i], beta, omega, phi, phi_sub)

        # update greater and equal (every thread of the block writes the
        # same output element; both use the previous step's weights)
        greater[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi)
        equal[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi_sub)

def main():
    """Allocate the inputs, launch the ``SIR`` kernel once and print the time.

    The kernel is launched with ``event_number`` blocks of ``pn`` threads.
    """

    # model parameters forwarded unchanged to the kernel
    beta = 1
    omega = 1
    gamma = 2    

    pn = 100            # threads per block in the launch below
    event_number = 50   # blocks in the launch below
    timestep = 100

    # NOTE(review): y stays a host NumPy array while every other array is
    # moved with cuda.to_device -- confirm whether that is intended.
    y = np.ones((event_number, timestep), dtype = np.int8)
    particles = cuda.to_device(np.zeros((event_number, timestep, pn), dtype = np.float32))
    weight = cuda.to_device(np.ones((event_number, timestep, pn), dtype = np.float32))
    greater = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
    equal = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))

    # one entry per element of the last axis of particles
    phi = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))
    phi_sub = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))

    # pn RNG states in total (not pn per block)
    rng_states = create_xoroshiro128p_states(pn, seed=1)

    start = timer()
    SIR[event_number, pn](rng_states, y, particles, weight, beta, 
    omega, gamma, greater, equal, phi, phi_sub)

    vectoradd_time = timer() - start

    print("sirModule1 took %f seconds" % vectoradd_time)

# run the benchmark only when executed as a script
if __name__ == '__main__':
    main()

然后我得到

numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR

numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemFree results in UNKNOWN_CUDA_ERROR

...错误

有人遇到过同样的问题吗?我在网上查了一下,有些人认为这个问题来自WDDM TDR,但我认为那只适用于Windows,对吗?

以下是代码中缺少的部分。

import numpy as np
import numba as nb
from timeit import default_timer as timer
from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32

"""
Look up table for factorial
"""
# Factorials 0! .. 20! (21 entries), copied to the device when the module is
# loaded.  sirModule_weight indexes this table with the observation y.
# 20! is the largest factorial that fits in a signed 64-bit integer.
LOOKUP_TABLE = cuda.to_device(np.array([
1, 1, 2, 6, 24, 120, 720, 5040, 40320,
362880, 3628800, 39916800, 479001600,
6227020800, 87178291200, 1307674368000,
20922789888000, 355687428096000, 6402373705728000,
121645100408832000, 2432902008176640000], dtype='int64'))


"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
    """Return arr[0] + arr[1] + ... + arr[arr.size - 1], added in index order."""
    total = 0
    for idx in range(arr.size):
        total += arr[idx]
    return total


"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
    """Return the sum of arr1[k] * arr2[k] for k in range(arr1.size)."""
    acc = 0
    count = arr1.size
    for k in range(count):
        acc = arr1[k] * arr2[k] + acc
    return acc


"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
    """Divide the calling thread's element of ``arr`` by ``div`` in place.

    Only ``arr[cuda.threadIdx.x]`` is touched.
    """
    lane = cuda.threadIdx.x
    scaled = arr[lane] / div
    arr[lane] = scaled

"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
    """Draw a value below 1 for the calling thread and store it in ``out``.

    Each candidate is ``inp[t] + beta + omega * n`` where ``t`` is
    ``cuda.threadIdx.x`` and ``n`` is returned by
    ``xoroshiro128p_normal_float32(rng_states, t)``.  Candidates are redrawn
    until one is ``< 1``; that one is written to ``out[t]``.

    NOTE(review): the RNG state is selected by ``threadIdx.x`` alone, so
    threads with the same index in different blocks would use the same state
    entry -- confirm this is intended.
    """
    thread_id = cuda.threadIdx.x

    # draw candidate sample from normal distribution and store
    # when less than 1
    while True:
        candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, thread_id)

        if candidate < 1:
            out[thread_id] = candidate
            break


"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma):
    """Compute the calling thread's unnormalized weight and store it in ``out``.

    With ``t = cuda.threadIdx.x`` the value written to ``out[t]`` is
    ``weight[t] * p2 * p1_div_p3`` where

    * ``Z = (current[t] - (previous[t] + beta)) / omega``
    * ``p1_div_p3 = 0.5 * (1 + erf(Z))``
    * ``mu = log(1 + exp(gamma * current[t]))``
    * ``p2 = exp(mu) * mu**y / LOOKUP_TABLE[y]``

    ``y`` indexes the module-level factorial table, which holds 21 entries,
    so it has to lie in ``0..20``; no bounds check is performed here.

    NOTE(review): a Poisson pmf would use ``exp(-mu)`` and a standard normal
    CDF would use ``erf(Z / sqrt(2))`` -- confirm the formulas are intended.
    """
    thread_id = cuda.threadIdx.x

    # calculate the pdf/pmf of given state
    Z = ( current[thread_id] - ( previous[ thread_id ] + beta ) ) / omega
    p1_div_p3 = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )

    mu = math.log( 1 + math.exp( gamma * current[ thread_id ] ) )
    p2 = math.exp( mu ) * mu**y / LOOKUP_TABLE[ y ]

    out[thread_id] = weight[thread_id]*p2*p1_div_p3


"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
    """Update the calling thread's ``phi`` and ``phi_sub`` entries.

    With ``t = cuda.threadIdx.x``:

    * ``Z = (1 - (current[t] + beta)) / omega``
    * ``phi[t] = 0.5 * (1 + erf(Z))``
    * ``phi_sub[t] = 1 - phi[t]``
    """
    # this line was not indented, which made the function a syntax error
    thread_id = cuda.threadIdx.x

    # calculate phi distribution and subtract from 1
    Z = ( 1 - ( current[ thread_id ] + beta ) ) / omega
    phi[ thread_id ] = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
    phi_sub[ thread_id ] = 1 - phi[ thread_id ]

但这些都是设备函数(device function)。问题的根源会在这里吗?

对于错误,我收到以下错误消息:我的代码中的第207行是我调用SIR模块的地方。

Traceback (most recent call last):
  File "CUDA_MonteCarlo_Testesr.py", line 214, in <module>
    main()
  File "CUDA_MonteCarlo_Testesr.py", line 207, in main
    omega, gamma, greater, equal, phi, phi_sub)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 703, in __call__
cfg(*args)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 483, in __call__
sharedmem=self.sharedmem)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 585, in _kernel_call
wb()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 600, in <lambda>
retr.append(lambda: devary.copy_to_host(val, stream=stream))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/devicearray.py", line 198, in copy_to_host
_driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1597, in device_to_host
fn(host_pointer(dst), device_pointer(src), size, *varargs)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Traceback (most recent call last):
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 647, in _exitfunc
f()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1099, in deref
mem.free()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1013, in free
self._finalizer()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 863, in core
deallocations.add_item(dtor, handle, size=bytesize)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 519, in add_item
self.clear()
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 530, in clear
dtor(handle)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
  File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemFree results in UNKNOWN_CUDA_ERROR

2 个答案:

答案 0 :(得分:1)

我认为可能有两个问题。

  1. 我不确定您在main之外使用LOOKUP_TABLE = cuda.to_device(是否有效。我想你正在尝试创建一个设备数组,但我认为你应该使用numba.cuda.device_array()

  2. 您似乎没有正确地将数组y传输到设备上以供使用。

当我进行这两项更改后,代码似乎可以在没有CUDA运行时错误的情况下运行:

    # cat t1.py
    import numpy as np
    import numba as nb
    from timeit import default_timer as timer
    # from matplotlib import pyplot as pt
    import math
    from numba import cuda
    from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32
    
    """
    Look up table for factorial
    """
    
    
    """
    arr_sum - sum element in array
    """
    @cuda.jit(device=True)
    def arr_sum(arr):
        """Return arr[0] + arr[1] + ... + arr[arr.size - 1], added in index order."""
        total = 0
        for idx in range(arr.size):
            total += arr[idx]
        return total
    
    
    """
    dot - dot product of arr1 and arr2
    """
    @cuda.jit(device=True)
    def dot(arr1, arr2):
        """Return the sum of arr1[k] * arr2[k] for k in range(arr1.size)."""
        acc = 0
        count = arr1.size
        for k in range(count):
            acc = arr1[k] * arr2[k] + acc
        return acc
    
    
    """
    arr_div - divide element in array
    """
    @cuda.jit(device=True)
    def arr_div(arr, div):
        """Divide the calling thread's element of ``arr`` by ``div`` in place.

        Only ``arr[cuda.threadIdx.x]`` is touched.
        """
        lane = cuda.threadIdx.x
        scaled = arr[lane] / div
        arr[lane] = scaled
    
    """
    SIR module (sample_draw) - module drawing sample for time t (rampling model)
    """
    @cuda.jit(device=True)
    def sirModule_sample_draw(rng_states, inp, beta, omega, out):
        """Draw a value below 1 for the calling thread and store it in ``out``.

        Each candidate is ``inp[t] + beta + omega * n`` where ``t`` is
        ``cuda.threadIdx.x`` and ``n`` is returned by
        ``xoroshiro128p_normal_float32(rng_states, t)``.  Candidates are
        redrawn until one is ``< 1``; that one is written to ``out[t]``.

        NOTE(review): the RNG state is selected by ``threadIdx.x`` alone, so
        threads with the same index in different blocks would use the same
        state entry -- confirm this is intended.
        """
        thread_id = cuda.threadIdx.x
    
        # draw candidate sample from normal distribution and store
        # when less than 1
        while True:
            candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, thread_id)
    
            if candidate < 1:
                out[thread_id] = candidate
                break
    
    
    """
    SIR module (weight calculation) - weight calculation method
    """
    @cuda.jit(device=True)
    def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma, lt):
        """Compute the calling thread's unnormalized weight and store it in ``out``.

        With ``t = cuda.threadIdx.x`` the value written to ``out[t]`` is
        ``weight[t] * p2 * p1_div_p3`` where

        * ``Z = (current[t] - (previous[t] + beta)) / omega``
        * ``p1_div_p3 = 0.5 * (1 + erf(Z))``
        * ``mu = log(1 + exp(gamma * current[t]))``
        * ``p2 = exp(mu) * mu**y / lt[y]``

        ``lt`` is the lookup table indexed by ``y``; no bounds check is
        performed, so ``y`` must be a valid index into it.

        NOTE(review): a Poisson pmf would use ``exp(-mu)`` and a standard
        normal CDF would use ``erf(Z / sqrt(2))`` -- confirm the formulas are
        intended.
        """
        thread_id = cuda.threadIdx.x

        # calculate the pdf/pmf of given state
        Z = ( current[thread_id] - ( previous[ thread_id ] + beta ) ) / omega
        p1_div_p3 = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )

        mu = math.log( 1 + math.exp( gamma * current[ thread_id ] ) )
        p2 =  math.exp( mu ) * mu**y /  lt[ y ]

        out[thread_id] = weight[thread_id]*p2*p1_div_p3
    
    
    """
    SIR module (phi distribution calculator)
    """
    @cuda.jit(device=True)
    def sirModule_tau(current, beta, omega, phi, phi_sub):
        """Update the calling thread's ``phi`` and ``phi_sub`` entries.

        With ``t = cuda.threadIdx.x``: ``phi[t] = 0.5 * (1 + erf(Z))`` for
        ``Z = (1 - (current[t] + beta)) / omega``, and ``phi_sub[t]`` is one
        minus the value just stored in ``phi[t]``.
        """
        lane = cuda.threadIdx.x

        shifted = current[lane] + beta
        z_score = (1 - shifted) / omega
        phi[lane] = 1.0 / 2.0 * (1.0 + math.erf(z_score))
        # read the stored value back, exactly as the complement is defined
        phi_sub[lane] = 1 - phi[lane]
    
    @cuda.jit
    def SIR(rng_states, y, particles, weight, beta, omega, gamma,
        greater, equal, phi, phi_sub, lt):
        """CUDA kernel: advance every event's particles through all time steps.

        The indexing implies one block per event (``blockIdx.x`` selects the
        row of ``y``, ``particles``, ``weight``, ``greater`` and ``equal``)
        and one thread per particle (the device helpers index their arrays
        with ``threadIdx.x``).  ``lt`` is the lookup table handed on to
        ``sirModule_weight``.

        For each time step ``i >= 1`` the calling thread draws a sample,
        computes and normalizes its weight, refreshes ``phi`` / ``phi_sub``
        and writes ``greater[ty][i]`` and ``equal[ty][i]``.  Block-wide
        barriers separate each per-thread write from the block-wide read
        that follows it.

        NOTE(review): ``phi``, ``phi_sub`` and ``rng_states`` are indexed by
        ``threadIdx.x`` only, so different blocks still touch the same
        entries; the barriers below order threads inside one block only --
        confirm whether per-block storage is needed.
        """
        ty = cuda.blockIdx.x # Block id in a 1D grid = event index

        # get current event y_t
        y_current = y[ ty ]

        # get number of time steps (same for every thread, so all threads
        # reach every barrier below)
        tn = y_current.size

        # iterator over timestep
        for i in range(1, tn):
            # draw samples
            sirModule_sample_draw(rng_states, particles[ty][i-1], beta,
                                     omega, particles[ty][i])

            # get weight
            sirModule_weight(particles[ty][i], particles[ty][i-1], weight[ty][i-1], weight[ty][i], y_current[i], beta, omega, gamma, lt)

            # every thread must have stored its weight before anyone sums the row
            cuda.syncthreads()

            # normalize weight
            weight_sum = arr_sum(weight[ty][i])

            # all sums must be finished before any element is overwritten
            cuda.syncthreads()
            arr_div(weight[ty][i], weight_sum)

            # calculate tau
            sirModule_tau(particles[ty][i], beta, omega, phi, phi_sub)

            # phi / phi_sub must be complete before the dot products read them
            cuda.syncthreads()

            # update greater and equal
            greater[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi)
            equal[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi_sub)

            # keep the next step's writes to phi / phi_sub from overtaking
            # threads that are still reading them here
            cuda.syncthreads()
    
    def main():
        """Allocate the inputs on the device, run ``SIR`` once and print the time.

        The kernel is launched with ``event_number`` blocks of ``pn`` threads.
        The timer is stopped only after ``cuda.synchronize()`` returns, so the
        reported time covers the kernel's execution and not just its
        asynchronous launch (it also includes whatever compilation the first
        call triggers).
        """

        # model parameters forwarded unchanged to the kernel
        beta = 1
        omega = 1
        gamma = 2

        pn = 100            # threads per block
        event_number = 50   # blocks
        timestep = 100

        # factorials 0! .. 20!, indexed by the observation inside the kernel
        lookup_table = cuda.to_device(np.array([
        1, 1, 2, 6, 24, 120, 720, 5040, 40320,
        362880, 3628800, 39916800, 479001600,
        6227020800, 87178291200, 1307674368000,
        20922789888000, 355687428096000, 6402373705728000,
        121645100408832000, 2432902008176640000], dtype='int64'))

        hy = np.ones((event_number, timestep), dtype = np.uint32)
        print(hy.size)
        print(hy)
        y = cuda.to_device(hy)
        particles = cuda.to_device(np.zeros((event_number, timestep, pn), dtype = np.float32))
        weight = cuda.to_device(np.ones((event_number, timestep, pn), dtype = np.float32))
        greater = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
        equal = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))

        # one entry per particle; pn is the length of particles' last axis
        phi = cuda.to_device(np.zeros(pn, dtype = np.float32))
        phi_sub = cuda.to_device(np.zeros(pn, dtype = np.float32))

        rng_states = create_xoroshiro128p_states(pn, seed=1)

        start = timer()
        SIR[event_number, pn](rng_states, y, particles, weight, beta, omega, gamma, greater, equal, phi, phi_sub, lookup_table)
        # wait for the kernel to finish before reading the clock
        cuda.synchronize()

        vectoradd_time = timer() - start

        print("sirModule1 took %f seconds" % vectoradd_time)

    # run the benchmark only when executed as a script
    if __name__ == '__main__':
        main()
    
    # cuda-memcheck python t1.py
    ========= CUDA-MEMCHECK
    5000
    [[1 1 1 ..., 1 1 1]
     [1 1 1 ..., 1 1 1]
     [1 1 1 ..., 1 1 1]
     ...,
     [1 1 1 ..., 1 1 1]
     [1 1 1 ..., 1 1 1]
     [1 1 1 ..., 1 1 1]]
    sirModule1 took 0.840958 seconds
    ========= ERROR SUMMARY: 0 errors
    #
    

答案 1 :(得分:-1)

解决了!我正在使用Ubuntu 16.04。首次安装Numba时,numba.cuda函数可以正常工作。但是后来我遇到了这类错误

raise CudaAPIError(retcode, msg)

CudaAPIError: Call to cuMemcpyHtoD results in CUDA_ERROR_LAUNCH_FAILED

在系统进入“挂起”(suspend)状态之后会遇到这些错误。为了避免此类错误,请重新启动系统,或者不要让系统挂起。