我是numba的新手,并试图加快我的monte carlo方法。我目前正在使用GeForce 950M在Ubuntu 14.04上工作。 CUDA版本是8.0.61。
当我尝试运行以下代码时,我从CUDA API获得了一些与内存相关的错误
代码:
@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
greater, equal, phi, phi_sub):
# thread/block index for accessing data
tx = cuda.threadIdx.x # Thread id in a 1D block = particle index
ty = cuda.blockIdx.x # Block id in a 1D grid = event index
bw = cuda.blockDim.x # Block width, i.e. number of threads per block = particle number
pos = tx + ty * bw # computed flattened index inside the array
# get current event y_t
y_current = y[ ty ]
# get number of time steps
tn = y_current.size
# iterator over timestep
for i in range(1, tn):
# draw samples
sirModule_sample_draw(rng_states, particles[ty][i-1], beta,
omega, particles[ty][i])
# get weight
sirModule_weight(particles[ty][i], particles[ty][i-1], weight[ty][i-1],
weight[ty][i], y_current[i], beta, omega, gamma)
# normalize weight
weight_sum = arr_sum(weight[ty][i])
arr_div(weight[ty][i], weight_sum)
# calculate tau
sirModule_tau(particles[ty][i], beta, omega, phi, phi_sub)
# update greater and equal
greater[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi)
equal[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi_sub)
def main():
beta = 1
omega = 1
gamma = 2
pn = 100
event_number = 50
timestep = 100
y = np.ones((event_number, timestep), dtype = np.int8)
particles = cuda.to_device(np.zeros((event_number, timestep, pn), dtype = np.float32))
weight = cuda.to_device(np.ones((event_number, timestep, pn), dtype = np.float32))
greater = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
equal = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
phi = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))
phi_sub = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))
rng_states = create_xoroshiro128p_states(pn, seed=1)
start = timer()
SIR[event_number, pn](rng_states, y, particles, weight, beta,
omega, gamma, greater, equal, phi, phi_sub)
vectoradd_time = timer() - start
print("sirModule1 took %f seconds" % vectoradd_time)
if __name__ == '__main__':
main()
然后我得到
numba.cuda.cudadrv.driver.CudaAPIError:[715]调用cuMemcpyDtoH导致UNKNOWN_CUDA_ERROR
numba.cuda.cudadrv.driver.CudaAPIError:[715]调用cuMemFree导致UNKNOWN_CUDA_ERROR
...错误
有人遇到同样的问题吗?我在网上查了一下,有些人认为这个问题来自WDDM TDR,但我认为只适用于Windows,对吗?
以下是代码中缺少的部分。
import numpy as np
import numba as nb
from timeit import default_timer as timer
from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32
"""
Look up table for factorial
"""
LOOKUP_TABLE = cuda.to_device(np.array([
1, 1, 2, 6, 24, 120, 720, 5040, 40320,
362880, 3628800, 39916800, 479001600,
6227020800, 87178291200, 1307674368000,
20922789888000, 355687428096000, 6402373705728000,
121645100408832000, 2432902008176640000], dtype='int64'))
"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
result = 0
for i in range(arr.size):
result = result + arr[i]
return result
"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
result = 0
for i in range(arr1.size):
result = arr1[i]*arr2[i] + result
return result
"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
thread_id = cuda.threadIdx.x
arr[thread_id] = arr[thread_id]/div
"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
"""Find a value less than 1 from nomral distribution"""
thread_id = cuda.threadIdx.x
# draw candidate sample from normal distribution and store
# when less than 1
while True:
candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, thread_id)
if candidate < 1:
out[thread_id] = candidate
break
"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma):
thread_id = cuda.threadIdx.x
PI = 3.14159265359
# calculate the pdf/pmf of given state
Z = ( current[thread_id] - ( previous[ thread_id ] + beta ) ) / omega
p1_div_p3 = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
mu = math.log( 1 + math.exp( gamma * current[ thread_id ] ) )
p2 = math.exp( mu ) * mu**y / LOOKUP_TABLE[ y ]
out[thread_id] = weight[thread_id]*p2*p1_div_p3
"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
thread_id = cuda.threadIdx.x
# calculate phi distribution and subtract from 1
Z = ( 1 - ( current[ thread_id ] + beta ) ) / omega
phi[ thread_id ] = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
phi_sub[ thread_id ] = 1 - phi[ thread_id ]
但这些是设备功能。这应该是问题的根源吗?
对于错误,我收到以下错误消息:我的代码中的第207行是我调用SIR模块的地方。
Traceback (most recent call last):
File "CUDA_MonteCarlo_Testesr.py", line 214, in <module>
main()
File "CUDA_MonteCarlo_Testesr.py", line 207, in main
omega, gamma, greater, equal, phi, phi_sub)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 703, in __call__
cfg(*args)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 483, in __call__
sharedmem=self.sharedmem)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 585, in _kernel_call
wb()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/compiler.py", line 600, in <lambda>
retr.append(lambda: devary.copy_to_host(val, stream=stream))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/devicearray.py", line 198, in copy_to_host
_driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1597, in device_to_host
fn(host_pointer(dst), device_pointer(src), size, *varargs)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemcpyDtoH results in UNKNOWN_CUDA_ERROR
Traceback (most recent call last):
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 647, in _exitfunc
f()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1099, in deref
mem.free()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 1013, in free
self._finalizer()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/utils.py", line 571, in __call__
return info.func(*info.args, **(info.kwargs or {}))
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 863, in core
deallocations.add_item(dtor, handle, size=bytesize)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 519, in add_item
self.clear()
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 530, in clear
dtor(handle)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 288, in safe_cuda_api_call
self._check_error(fname, retcode)
File "/home/ryan/anaconda3/envs/py53/lib/python3.5/site-packages/numba/cuda/cudadrv/driver.py", line 323, in _check_error
raise CudaAPIError(retcode, msg)
numba.cuda.cudadrv.driver.CudaAPIError: [715] Call to cuMemFree results in UNKNOWN_CUDA_ERROR
答案 0 :(得分:1)
我认为可能有两个问题。
我不确定您在main之外使用LOOKUP_TABLE = cuda.to_device(
是否有效。我想你正在尝试创建一个设备数组,但我认为你应该使用numba.cuda.device_array()
。
您似乎没有正确地将阵列y
转移到设备上以供使用。
当我进行这两项更改时,代码似乎在没有CUDA运行时错误的情况下运行:
# cat t1.py
import numpy as np
import numba as nb
from timeit import default_timer as timer
# from matplotlib import pyplot as pt
import math
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states, xoroshiro128p_normal_float32
"""
Look up table for factorial
"""
"""
arr_sum - sum element in array
"""
@cuda.jit(device=True)
def arr_sum(arr):
result = 0
for i in range(arr.size):
result = result + arr[i]
return result
"""
dot - dot product of arr1 and arr2
"""
@cuda.jit(device=True)
def dot(arr1, arr2):
result = 0
for i in range(arr1.size):
result = arr1[i]*arr2[i] + result
return result
"""
arr_div - divide element in array
"""
@cuda.jit(device=True)
def arr_div(arr, div):
thread_id = cuda.threadIdx.x
arr[thread_id] = arr[thread_id]/div
"""
SIR module (sample_draw) - module drawing sample for time t (rampling model)
"""
@cuda.jit(device=True)
def sirModule_sample_draw(rng_states, inp, beta, omega, out):
"""Find a value less than 1 from nomral distribution"""
thread_id = cuda.threadIdx.x
# draw candidate sample from normal distribution and store
# when less than 1
while True:
candidate = inp[thread_id] + beta + omega * xoroshiro128p_normal_float32(rng_states, thread_id)
if candidate < 1:
out[thread_id] = candidate
break
"""
SIR module (weight calculation) - weight calculation method
"""
@cuda.jit(device=True)
def sirModule_weight(current, previous, weight, out, y, beta, omega, gamma, lt):
thread_id = cuda.threadIdx.x
PI = 3.14159265359
# calculate the pdf/pmf of given state
Z = ( current[thread_id] - ( previous[ thread_id ] + beta ) ) / omega
p1_div_p3 = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
mu = math.log( 1 + math.exp( gamma * current[ thread_id ] ) )
p2 = math.exp( mu ) * mu**y / lt[ y ]
out[thread_id] = weight[thread_id]*p2*p1_div_p3
"""
SIR module (phi distribution calculator)
"""
@cuda.jit(device=True)
def sirModule_tau(current, beta, omega, phi, phi_sub):
thread_id = cuda.threadIdx.x
# calculate phi distribution and subtract from 1
Z = ( 1 - ( current[ thread_id ] + beta ) ) / omega
phi[ thread_id ] = 1.0 / 2.0 * ( 1.0 + math.erf( Z ) )
phi_sub[ thread_id ] = 1 - phi[ thread_id ]
@cuda.jit
def SIR(rng_states, y, particles, weight, beta, omega, gamma,
greater, equal, phi, phi_sub, lt):
# thread/block index for accessing data
tx = cuda.threadIdx.x # Thread id in a 1D block = particle index
ty = cuda.blockIdx.x # Block id in a 1D grid = event index
bw = cuda.blockDim.x # Block width, i.e. number of threads per block = particle number
pos = tx + ty * bw # computed flattened index inside the array
# get current event y_t
y_current = y[ ty ]
# get number of time steps
tn = y_current.size
# iterator over timestep
for i in range(1, tn):
# draw samples
sirModule_sample_draw(rng_states, particles[ty][i-1], beta,
omega, particles[ty][i])
# get weight
sirModule_weight(particles[ty][i], particles[ty][i-1], weight[ty][i-1], weight[ty][i], y_current[i], beta, omega, gamma, lt)
# normalize weight
weight_sum = arr_sum(weight[ty][i])
arr_div(weight[ty][i], weight_sum)
# calculate tau
sirModule_tau(particles[ty][i], beta, omega, phi, phi_sub)
# update greater and equal
greater[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi)
equal[ty][i] = greater[ty][i-1]*dot(weight[ty][i-1], phi_sub)
def main():
beta = 1
omega = 1
gamma = 2
pn = 100
event_number = 50
timestep = 100
LOOKUP_TABLE = cuda.to_device(np.array([
1, 1, 2, 6, 24, 120, 720, 5040, 40320,
362880, 3628800, 39916800, 479001600,
6227020800, 87178291200, 1307674368000,
20922789888000, 355687428096000, 6402373705728000,
121645100408832000, 2432902008176640000], dtype='int64'))
hy = np.ones((event_number, timestep), dtype = np.uint32)
print(hy.size)
print(hy)
y = cuda.to_device(hy)
particles = cuda.to_device(np.zeros((event_number, timestep, pn), dtype = np.float32))
weight = cuda.to_device(np.ones((event_number, timestep, pn), dtype = np.float32))
greater = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
equal = cuda.to_device(np.ones((event_number, timestep), dtype = np.float32))
phi = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))
phi_sub = cuda.to_device(np.zeros(particles[0][0].size, dtype = np.float32))
rng_states = create_xoroshiro128p_states(pn, seed=1)
start = timer()
SIR[event_number, pn](rng_states, y, particles, weight, beta, omega, gamma, greater, equal, phi, phi_sub, LOOKUP_TABLE)
vectoradd_time = timer() - start
print("sirModule1 took %f seconds" % vectoradd_time)
cuda.synchronize()
if __name__ == '__main__':
main()
# cuda-memcheck python t1.py
========= CUDA-MEMCHECK
5000
[[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
...,
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]
[1 1 1 ..., 1 1 1]]
sirModule1 took 0.840958 seconds
========= ERROR SUMMARY: 0 errors
#
答案 1 :(得分:-1)
解决了!我正在使用Ubuntu 16.04。首次安装Numba时,numba.cuda函数可以正常工作。但是后来我遇到了这类错误
提高CudaAPIError(retcode,msg)
CudaAPIError:调用cuMemcpyHtoD结果导致CUDA_ERROR_LAUNCH_FAILED
将系统置于“挂起”状态时会遇到这些错误。为了避免此类错误,请重新启动系统或不要挂起。