clEnqueueReadBuffer失败:资源不足

时间:2015-03-07 18:52:12

标签: python opencl gpgpu raytracing pyopencl

我在尝试使用pyopencl进行光线追踪程序时遇到了麻烦。

为了给你一些背景,这个程序背后的想法是我在网格中有一组正面的光子,它们只在z分量上有动量。他们通过具有空间相关折射率的镜头。在几个时间间隔后,我尝试用光子的位置构建一个histrogram

为此,我使用了numpy和pyopencl。我首先将光子排列成这种格式的数组:[[X0,Y0,Z0,Kx0,KY0,KZ0],...,[XN,YN,ZN,KxN,KYN,KZN]]其中0 - &gt ; N是光子的指数。然后我将这个numpy数组传递给设备进行计算,这是一个简单的线性系统与Runge Kutta第四阶的整合。

我的代码尝试如下:

Python Main

# -*- coding: utf-8 -*-

import numpy as np
import pyopencl as cl
from pylab import *


if __name__ == "__main__":

# Photons grid
Lx = 1.0
Ly = 1.0
delta_x = 0.01
delta_y = 0.01
Nx  = Lx / delta_x
Ny  = Ly / delta_y

X,Y = np.mgrid[-Lx:Lx+delta_x:delta_x,-Ly:Ly+delta_y:delta_y]
X = X.ravel()
Y = Y.ravel()
Z = -10.0

Kx = 0.0
Ky = 0.0
Kz = 1.0

N_points = len(X)

# Arranging the initial photons on a grid to send to Device in form of [[X0,Y0, ...], [X1,Y1, ...], ....]
grid_h = []
for i in range(N_points):
    grid_h.append(np.array([X[i], Y[i], Z, Kx, Ky, Kz], dtype=np.float32))
grid_h = array(grid_h)


# Time Parameters
Lt = np.float32(1.0)  # Time interval between departure of the photons and arrival at the "screen"
delta_t = np.float32(0.1)  # Time step
T = np.arange(0.0, Lt, delta_t)


# Device Init
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
MF = cl.mem_flags

# CL Source Read and Build
f = open("kernel_tracer_2.cl", "r")
Source = f.read()
prg = cl.Program(ctx, Source).build()

# Constant init
N0 = np.float32(1.0)
Nm = np.float32(0.05)
C0 = np.float32(3.0)
DR = np.float32(0.2)

# Execution; Sends Blocks of photons to device, evolves them step by step
grid_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=grid_h)
for t in T:

    # Evolve every photon
    time = np.float32(t)
    prg.RK4Step(queue, grid_h.shape, None, time, delta_t, N0, Nm, C0, DR, grid_d)

cl.enqueue_copy(queue, grid_h, grid_d)
X = grid_h[:, 0]
Y = grid_h[:, 1]

# Plotting
figure(1)
plt.plot(X,Y,'.')

figure(2)
H, xedges, yedges = histogram2d(X,Y, bins=50)
fig = plt.figure(figsize=(7, 3))
ax = fig.add_subplot(131)
ax.set_title('imshow: equidistant')
im = plt.imshow(H, interpolation='nearest', origin='low',
                extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]])

show()

设备代码 (kernel_tracer_2.cl)

float8 f(float t, float N0, float Nm, float C0, float DR,
     float8 q)
{ 



const float ind = N0 + Nm * exp(-(q.s0*q.s0 + q.s1*q.s1 + q.s2*q.s2)/(DR*DR));
const float np = sqrt(q.s3*q.s3 + q.s4*q.s4 + q.s5*q.s5);
const float c1 = C0 / (ind * np);
const float c2 = -2.0 * C0 * Nm *exp(-(q.s0*q.s0 + q.s1*q.s1 + q.s2*q.s2)/(DR*DR)) / (ind * ind * DR * DR);
float8 v;

v.s0 = c1 * q.s3;
v.s1 = c1 * q.s4;
v.s2 = c1 * q.s5;


v.s3 = c2 * q.s0;
v.s4 = c2 * q.s1;
v.s5 = c2 * q.s2;

return v;
}


__kernel void RK4Step(float t, float dt, float N0, float Nm, float C0, float DR, __global float8 *p){
const int gid = get_global_id(0);
float8 k, qm,qs;


//k1
k = f(t, N0, Nm ,C0, DR, p[gid]);
qs = p[gid] + dt * k/6.0;
qm = p[gid] + 0.5 * dt * k;

//k2
k = f(t+0.5*dt, N0, Nm ,C0, DR, qm);
qs +=  dt * k/3.0;
qm = p[gid] + 0.5 * dt * k;

//k3
k = f(t+0.5*dt, N0, Nm ,C0, DR, qm);
qs +=  dt * k/3.0;
qm = p[gid] + dt * k;

//k4
k = f(t + dt, N0, Nm ,C0,DR, qm);
qs +=  dt * k/6.0;

//update photon
p[gid] = qs;
}

每当我运行代码时,我都会收到以下错误:

Traceback (most recent call last):
File "path here", line 73, in <module>
cl.enqueue_copy(queue, grid_h, grid_d)
File "C:\Python27\lib\site-packages\pyopencl-2014.1-py2.7-win-       amd64.egg\pyopencl\__init__.py", line 1090, in enqueue_copy
return _cl._enqueue_read_buffer(queue, src, dest, **kwargs)
RuntimeError: clEnqueueReadBuffer failed: out of resources

第73行是:cl.enqueue_copy(queue,grid_h,grid_d) 那时我试图将计算结果复制回设备。

现在的事情是。我的GTX970上出现此错误。我在Titan中尝试了这个代码,它运行正常,直到我将delta_x和delta_y更改为低于0.01的值。另一方面,我尝试使用旧的Mobility Radeon HD 5145在我的笔记本电脑上运行代码并运行良好。我知道它是一个不同的SDK,但这让我感到困惑。 我还尝试将delta_x和delta_y都更改为0.1只是为了测试它,并且我没有错误,但结果都是错误的,与Titan和我的笔记本电脑的结果相比。

我有来自nvidia的最新驱动程序和sdk。

我很抱歉这篇文章很长,但我一直试图解决这个问题,并且不知道是什么导致了这一点。我试图发布我认为相关的所有信息。

1 个答案:

答案 0 :(得分:1)

我不认为这是你的问题,但从这看起来,你应该等到你的RK4步骤完成才能复制你的结果:

prg.RK4Step(queue, grid_h.shape, None, time, delta_t, N0, Nm, C0, DR, grid_d)
cl.enqueue_copy(queue, grid_h, grid_d)

取而代之的是

completeEvent = prg.RK4Step(queue, grid_h.shape, None, time, delta_t, N0, Nm, C0, DR, grid_d)
events = [ completeEvent ];
cl.enqueue_copy(queue, grid_h, grid_d, wait_for=events);

或致电completeEvent.wait()。您的内核调用没有阻塞,因此无法保证完成,但您将结果复制出来。这可能导致有趣(不正确)的行为。