我正在迈出OpenCL编程的第一步。我已经有一个框架,至少可以把CPU上的数组传给OpenCL,在OpenCL中执行运算,然后把数组读回(结果正确)。目前我正尝试在此基础上加入this OpenCL example中的位移网格(幻灯片18-23;唯一的重大改动是我把VBO从float4改成了float3)。
我已经按照这些幻灯片和this resource中的说明设置了共享上下文。我用CPU输入数据测试过VBO(所以我知道它能正确绘制)。另外,我是在创建VBO之前创建上下文的(依据this thread的建议)。最后,我尝试把内核改写成下面这样[编辑]:
// Debug kernel: one work-item per mesh vertex; stores the constant position
// (1, 1, 1) at the vertex's row-major index. `height` and `time` are accepted
// but not used by this version.
// NOTE(review): in OpenCL C a float3 is sized and aligned like a float4
// (16 bytes), so pos[i] advances four floats per element. If the host VBO is
// packed as three floats per vertex, this indexing will not line up with it
// (vstore3 on a float pointer would) -- confirm against the host-side layout.
__kernel void sine_wave(__global float3* pos, int width, int height, float time) {
    const uint col = get_global_id(0);
    const uint row = get_global_id(1);
    const uint idx = row * width + col;
    pos[idx] = (float3)(1.0f, 1.0f, 1.0f);
}
然而,无论我怎么做,都无法让OpenCL程序更新任何内容。没有报任何错误,但VBO的内容始终与输入数据相同。如果我不指定输入数据,所有点都会渲染在(0,0,0)处。我想不出是什么原因造成的。
想法?谢谢,
伊恩
PS#1:目前的系统是NVIDIA GTX 580M,在Windows 7 x64上,尽管编写的代码是可移植的。
PS#2:如果大家没有头绪,我可以提供代码。
答案 0 :(得分:1)
好吧,我弄明白了。经过几个小时的搜索,我下载了NVIDIA的GPU Computing Toolkit,链接中的演示似乎就出自这里。然后我把代码大幅精简为下面约220行的源码(可能对以后的开发者有帮助):
#pragma comment(lib,"Opengl32.lib")
#pragma comment(lib,"glu32.lib")
#pragma comment(lib,"OpenCL.lib")
#pragma comment(lib,"glew32.lib")
#pragma comment(lib,"glut32.lib")
// Standard library (included before glut.h so its declaration of exit() is seen first)
#include <stdio.h>
#include <stdlib.h>
// OpenGL Graphics Includes
#include <GL/glew.h>
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenGL/OpenGL.h>
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#ifdef UNIX
#include <GL/glx.h>
#endif
#endif
#include <CL/opencl.h>
// Window size in pixels.
const unsigned int window_width = 512;
const unsigned int window_height = 512;

// Mesh resolution: one vertex (and one kernel work-item) per grid point.
const unsigned int mesh_width = 256;
const unsigned int mesh_height = 256;

// OpenCL state, filled in by main() and used every frame by DisplayGL().
cl_context cxGPUContext;
cl_device_id* cdDevices;
cl_command_queue cqCommandQueue;
cl_program cpProgram;
cl_kernel ckKernel;
cl_mem vbo_cl;  // OpenCL view of the GL vertex buffer `vbo`
// 2D global work size for the kernel launch: exactly the mesh dimensions.
size_t szGlobalWorkSize[] = {mesh_width, mesh_height};

// OpenGL vertex buffer object holding the vertex positions.
GLuint vbo;

// Mouse state shared between the mouse() and motion() callbacks.
int mouse_old_x;
int mouse_old_y;
int mouse_buttons = 0;  // bitmask, bit N set while GLUT button N is held

// Camera controls applied to the modelview matrix in DisplayGL().
float rotate_x = 0.0f;
float rotate_y = 0.0f;
float translate_z = -3.0f;
// GLUT mouse-button callback.
// On press, sets bit `button` in the mouse_buttons mask. On release, clears
// the WHOLE mask (not just the released button's bit), so releasing any
// button ends every drag. Always records the cursor position so motion()
// can compute deltas from it.
void mouse(int button, int state, int x, int y) {
    switch (state) {
    case GLUT_DOWN:
        mouse_buttons |= (1 << button);
        break;
    case GLUT_UP:
        mouse_buttons = 0;
        break;
    default:
        break;
    }
    mouse_old_x = x;
    mouse_old_y = y;
}
// GLUT mouse-drag callback.
// Converts cursor movement since the last recorded position into a view
// change: with bit 0 of mouse_buttons set the mesh is rotated; otherwise,
// with bit 2 set, it is moved along the z axis. The bits correspond to the
// GLUT button numbers shifted in by mouse() (presumably left and right
// button -- confirm against the GLUT headers in use).
void motion(int x, int y) {
    enum { ROTATE_MASK = 1, ZOOM_MASK = 4 };

    const float dx = (float)(x - mouse_old_x);
    const float dy = (float)(y - mouse_old_y);

    // Remember this position for the next delta.
    mouse_old_x = x;
    mouse_old_y = y;

    if (mouse_buttons & ROTATE_MASK) {
        rotate_x += dy * 0.2f;
        rotate_y += dx * 0.2f;
    } else if (mouse_buttons & ZOOM_MASK) {
        translate_z += dy * 0.01f;
    }
}
void DisplayGL(void) {
static float anim = 0.0f;
// run OpenCL kernel to generate vertex positions
glFinish();
clEnqueueAcquireGLObjects(cqCommandQueue, 1, &vbo_cl, 0,0,0);
clSetKernelArg(ckKernel, 3, sizeof(float), &anim);
clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 2, NULL, szGlobalWorkSize, NULL, 0,0,0 );
clEnqueueReleaseGLObjects(cqCommandQueue, 1, &vbo_cl, 0,0,0);
clFinish(cqCommandQueue);
// set view matrix
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glLoadIdentity();
glTranslatef(0.0, 0.0, translate_z);
glRotatef(rotate_x, 1.0, 0.0, 0.0);
glRotatef(rotate_y, 0.0, 1.0, 0.0);
glBindBuffer(GL_ARRAY_BUFFER, vbo);
glVertexPointer(4, GL_FLOAT, 0, 0);
glEnableClientState(GL_VERTEX_ARRAY);
glColor3f(1.0, 0.0, 0.0);
glDrawArrays(GL_POINTS, 0, mesh_width * mesh_height);
glDisableClientState(GL_VERTEX_ARRAY);
// flip backbuffer to screen
glutSwapBuffers();
anim += 0.01f;
}
// GLUT timer callback: requests a redraw and re-arms itself, so the display
// callback keeps being scheduled roughly every 10 ms. `value` is unused.
void timerEvent(int value) {
    enum { REDRAW_INTERVAL_MS = 10 };
    (void)value;
    glutPostRedisplay();
    glutTimerFunc(REDRAW_INTERVAL_MS, timerEvent, 0);
}
int main(int argc, char** argv) {
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE);
glutInitWindowPosition (glutGet(GLUT_SCREEN_WIDTH)/2 - window_width/2, glutGet(GLUT_SCREEN_HEIGHT)/2 - window_height/2);
glutInitWindowSize(window_width, window_height);
glutCreateWindow("OpenCL/GL Interop (VBO)");
glutDisplayFunc(DisplayGL);
glutMouseFunc(mouse);
glutMotionFunc(motion);
glutTimerFunc(10, timerEvent,0);
glewInit();
glClearColor(0.0, 0.0, 0.0, 1.0);
glDisable(GL_DEPTH_TEST);
glViewport(0, 0, window_width, window_height);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluPerspective(60.0, (GLfloat)window_width / (GLfloat) window_height, 0.1, 10.0);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
//Get the NVIDIA platform
cl_platform_id cpPlatform;
clGetPlatformIDs(1,&cpPlatform,NULL);
// Get the number of GPU devices available to the platform
cl_uint uiDevCount;
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiDevCount);
// Create the device list
cdDevices = new cl_device_id [uiDevCount];
clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
// Define OS-specific context properties and create the OpenCL context
#if defined (__APPLE__)
CGLContextObj kCGLContext = CGLGetCurrentContext();
CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext);
cl_context_properties props[] =
{
CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup,
0
};
cxGPUContext = clCreateContext(props, 0,0, NULL, NULL, &ciErrNum);
#else
#ifdef UNIX
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)glXGetCurrentContext(),
CL_GLX_DISPLAY_KHR, (cl_context_properties)glXGetCurrentDisplay(),
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0
};
cxGPUContext = clCreateContext(props, 1, &cdDevices[uiDeviceUsed], NULL, NULL, &ciErrNum);
#else // Win32
cl_context_properties props[] =
{
CL_GL_CONTEXT_KHR, (cl_context_properties)wglGetCurrentContext(),
CL_WGL_HDC_KHR, (cl_context_properties)wglGetCurrentDC(),
CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
0
};
cxGPUContext = clCreateContext(props, 1, &cdDevices[0], NULL, NULL, NULL);
#endif
#endif
// create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[0], 0, NULL);
const char* cSourceCL = "__kernel void sine_wave(__global float4* pos, unsigned int width, unsigned int height, float time)\n"
"{\n"
" unsigned int x = get_global_id(0);\n"
" unsigned int y = get_global_id(1);\n"
"\n"
" // calculate uv coordinates\n"
" float u = x / (float) width;\n"
" float v = y / (float) height;\n"
" u = u*2.0f - 1.0f;\n"
" v = v*2.0f - 1.0f;\n"
"\n"
" // calculate simple sine wave pattern\n"
" float freq = 4.0f;\n"
" float w = sin(u*freq + time) * cos(v*freq + time) * 0.5f;\n"
"\n"
" // write output vertex\n"
" pos[y*width+x] = (float4)(u, w, v, 1.0f);\n"
"}\n";
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &cSourceCL, NULL, NULL);
clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
// create the kernel
ckKernel = clCreateKernel(cpProgram, "sine_wave", NULL);
// create VBO (if using standard GL or CL-GL interop), otherwise create Cl buffer
unsigned int size = mesh_width * mesh_height * 4 * sizeof(float);
glGenBuffers(1,&vbo);
glBindBuffer(GL_ARRAY_BUFFER,vbo);
// initialize buffer object
glBufferData(GL_ARRAY_BUFFER, size, 0, GL_DYNAMIC_DRAW);
// create OpenCL buffer from GL VBO
vbo_cl = clCreateFromGLBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, vbo, NULL);
// set the args values
clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &vbo_cl);
clSetKernelArg(ckKernel, 1, sizeof(unsigned int), &mesh_width);
clSetKernelArg(ckKernel, 2, sizeof(unsigned int), &mesh_height);
glutMainLoop();
}
与原始代码进行比较后,我(最终)找到了关键区别。
正确:
clEnqueueNDRangeKernel(context->command_queue, kernel->kernel, 2, NULL, global,NULL, 0,0,0 );
错误:
clEnqueueNDRangeKernel(context->command_queue, kernel->kernel, 2, NULL, global,local, 0,0,0 );
事实证明,我使用的网格尺寸是10x10,比我在别处看到的示例要小,而那些示例让我用16x16作为“本地”(local)工作组大小。因为“全局”(global)大小就是网格尺寸,所以这里“全局”小于“本地”。
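不知为何,这并没有报出任何错误;不过说实话,到目前为止我还不能说自己完全理解这些变量的用途。(注:按照OpenCL 1.x规范,当全局大小不能被本地大小整除时,clEnqueueNDRangeKernel应返回CL_INVALID_WORK_GROUP_SIZE,所以很可能只是返回值没有被检查。)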
伊恩