我快要崩溃了。我努力了一整天,但毫无进展。我在运行一些以前运行良好的代码时遇到了问题,所以我写了一个简短的"玩具" OpenCL 程序,试图弄清楚发生了什么,但这个玩具程序反而让我更加困惑,也让我无比沮丧。
我正在使用一块具有 3 GB 全局内存的 Nvidia 780i 显卡,它的单次最大分配量约为 780 MB。起初,当我故意过度分配时,它并不会报错。这个问题已经解决了(是一个笔误,但编译器/解析器没有发现)。现在,即使尝试分配的内存远低于设备应该能够处理的范围,我在第二个大缓冲区的分配上仍然得到错误代码 -6(CL_OUT_OF_HOST_MEMORY)。
我一直在研究这个错误,但无法弄清它在这种情况下是如何产生的。我使用的机器有 32 GB 内存,所以主机内存肯定不缺。我想这里一定有一些我没有理解的地方。
它将分配第一个缓冲区,但随后会阻塞第二个缓冲区。我基本上只能分配几乎我想要和需要的全局内存量。
非常感谢任何帮助。如果你能帮我解决这个问题,而且你在洛杉矶附近,我会请你出去喝一杯。我就是这么沮丧。
我的机器的代码和输出如下。
谢谢, 约翰
主程序:
#define _CRT_SECURE_NO_WARNINGS
#define PROGRAM_FILE "kernels.cl"
#define KERNEL_NAME "test"
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#define N_PROJ 4000
#define N_CHANNELS 736
#define N_ROWS 32
/* Find a GPU or CPU associated with the first available platform */
cl_device_id create_device() {
cl_platform_id platform;
cl_device_id dev;
int err;
/* Identify a platform */
err = clGetPlatformIDs(1, &platform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
/* Access a device */
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
perror("Just a heads up: I'm not going to run on the GPU");
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &dev, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
cl_ulong16 alloc_size,mem_size;
char name[40];
clGetDeviceInfo(dev,CL_DEVICE_MAX_MEM_ALLOC_SIZE,sizeof(cl_ulong16),&alloc_size,NULL);
clGetDeviceInfo(dev,CL_DEVICE_NAME,sizeof(name),name,NULL);
clGetDeviceInfo(dev,CL_DEVICE_GLOBAL_MEM_SIZE,sizeof(cl_ulong16),&mem_size,NULL);
printf("Using device: %s\n",name);
printf("Global memory size: %lu\n",mem_size);
printf("Max. allocation: %lu\n",alloc_size);
return dev;
}
/*
 * Create an OpenCL program from a source file and build it.
 *
 * ctx      - context the program is created in
 * dev      - device whose build log is printed when the build fails
 * filename - path of the OpenCL C source file
 *
 * The file is read in binary mode and the number of bytes actually returned
 * by fread() is used as the source length, so no hand-tuned correction of
 * the ftell() result is needed.
 *
 * Returns the built program. Terminates the process with exit(1) when the
 * file cannot be read, memory cannot be allocated, or program creation or
 * the build fails; on a build failure the build log is printed to stdout.
 */
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
    cl_program program;
    FILE *program_handle;
    char *program_buffer, *program_log;
    size_t program_size, log_size;
    long file_size;
    cl_int err;

    /* Read program file and place content into buffer */
    program_handle = fopen(filename, "rb");
    if (program_handle == NULL) {
        perror("Couldn't find the program file");
        exit(1);
    }
    if (fseek(program_handle, 0, SEEK_END) != 0 ||
        (file_size = ftell(program_handle)) < 0) {
        perror("Couldn't determine the size of the program file");
        fclose(program_handle);
        exit(1);
    }
    rewind(program_handle);

    program_buffer = (char*)malloc((size_t)file_size + 1);
    if (program_buffer == NULL) {
        fprintf(stderr, "Couldn't allocate memory for the program source\n");
        fclose(program_handle);
        exit(1);
    }
    /* Trust what was actually read rather than the size reported by ftell() */
    program_size = fread(program_buffer, sizeof(char), (size_t)file_size, program_handle);
    if (ferror(program_handle)) {
        perror("Couldn't read the program file");
        free(program_buffer);
        fclose(program_handle);
        exit(1);
    }
    fclose(program_handle);
    program_buffer[program_size] = '\0';

    /* Create program from file */
    program = clCreateProgramWithSource(ctx, 1,
        (const char**)&program_buffer, &program_size, &err);
    free(program_buffer);
    if (err < 0) {
        fprintf(stderr, "Couldn't create the program (OpenCL error %d)\n", (int)err);
        exit(1);
    }

    /* Build program */
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err < 0) {
        fprintf(stderr, "Couldn't build the program (OpenCL error %d)\n", (int)err);

        /* Find size of log and print to std output */
        log_size = 0;
        if (clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                                  0, NULL, &log_size) == CL_SUCCESS) {
            /* calloc keeps the buffer terminated even if the second query fails */
            program_log = (char*)calloc(log_size + 1, sizeof(char));
            if (program_log != NULL) {
                clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
                                      log_size + 1, program_log, NULL);
                printf("%s\n", program_log);
                free(program_log);
            }
        }
        exit(1);
    }
    return program;
}
/* Abort with a message when an OpenCL call did not return CL_SUCCESS.
   OpenCL does not set errno, so the numeric code is printed instead of
   going through perror(). */
static void check_cl(cl_int err, const char *what) {
    if (err != CL_SUCCESS) {
        fprintf(stderr, "%s (OpenCL error %d)\n", what, (int)err);
        exit(1);
    }
}

/*
 * Toy program: creates one small and two large device buffers from host
 * data, runs the kernel KERNEL_NAME as a single task, reads the three
 * buffers back and prints the first few values.
 *
 * Returns 0 on success; any failing allocation or OpenCL call terminates
 * the process with exit(1) after printing the failing step and error code.
 */
int main(int argc, const char * argv[])
{
    (void)argc;
    (void)argv;

    cl_int err;

    /* Element and byte counts of each large buffer, computed once in size_t */
    const size_t n_elems = (size_t)N_PROJ * N_CHANNELS * N_ROWS;
    const size_t big_bytes = n_elems * sizeof(float);

    /* Standard OCL structures */
    cl_device_id device = create_device();
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    check_cl(err, "Couldn't create a context");
    cl_program program = build_program(context, device, PROGRAM_FILE);
    cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
    check_cl(err, "Couldn't create a command queue");
    cl_kernel kernel = clCreateKernel(program, KERNEL_NAME, &err);
    check_cl(err, "Couldn't create the kernel");

    /* Declare and set data */
    int a[] = {1, 2, 3};
    float *rebin = (float*)calloc(n_elems, sizeof(float));
    float *mat = (float*)calloc(n_elems, sizeof(float));
    if (rebin == NULL || mat == NULL) {
        fprintf(stderr, "Couldn't allocate the host arrays\n");
        exit(1);
    }
    printf("\nAllocation size: %zu\n", big_bytes);
    printf("Total memory to be allocated: %zu\n", 2 * big_bytes + sizeof(a));

    /* Declare and set buffer objects.
       NOTE(review): error -6 is CL_OUT_OF_HOST_MEMORY, i.e. it is reported
       against host resources rather than device memory. Presumably a 32-bit
       build runs out of address space with two large host arrays plus the
       copies made for CL_MEM_COPY_HOST_PTR — confirm by building 64-bit. */
    cl_mem a_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                                   sizeof(a), a, &err);
    check_cl(err, "Couldn't create buffer 0");
    cl_mem rebin_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                                       big_bytes, rebin, &err);
    check_cl(err, "Couldn't create buffer 1");
    cl_mem mat_buff = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_WRITE,
                                     big_bytes, mat, &err);
    check_cl(err, "Couldn't create buffer 2");

    /* Bind the buffers to the kernel arguments */
    check_cl(clSetKernelArg(kernel, 0, sizeof(cl_mem), &mat_buff),
             "Couldn't set kernel argument 0");
    check_cl(clSetKernelArg(kernel, 1, sizeof(cl_mem), &rebin_buff),
             "Couldn't set kernel argument 1");
    check_cl(clSetKernelArg(kernel, 2, sizeof(cl_mem), &a_buff),
             "Couldn't set kernel argument 2");

    /* Run the kernel, then read every buffer back with blocking reads */
    check_cl(clEnqueueTask(queue, kernel, 0, NULL, NULL),
             "Couldn't enqueue the kernel");
    check_cl(clEnqueueReadBuffer(queue, mat_buff, CL_TRUE, 0, big_bytes, mat, 0, NULL, NULL),
             "Couldn't read back buffer 2");
    check_cl(clEnqueueReadBuffer(queue, rebin_buff, CL_TRUE, 0, big_bytes, rebin, 0, NULL, NULL),
             "Couldn't read back buffer 1");
    check_cl(clEnqueueReadBuffer(queue, a_buff, CL_TRUE, 0, sizeof(a), a, 0, NULL, NULL),
             "Couldn't read back buffer 0");

    printf("%f %f %f\n", mat[1], mat[2], mat[3]);
    printf("%f %f %f\n", rebin[1], rebin[2], rebin[3]);
    printf("%i %i %i", a[0], a[1], a[2]);

    /* Release device buffers and host arrays before the remaining objects */
    clReleaseMemObject(mat_buff);
    clReleaseMemObject(rebin_buff);
    clReleaseMemObject(a_buff);
    free(mat);
    free(rebin);

    clReleaseKernel(kernel);
    clReleaseCommandQueue(queue);
    clReleaseProgram(program);
    clReleaseContext(context);

    printf("\n\nProgram apparently executed fully. \n");
    return 0;
}
内核:
/*
 * Toy kernel: writes fixed marker values into its three buffers.
 *
 *   a[0..2]     <- 3, 2, 1
 *   rebin[1..3] <- 1.0, 2.0, 3.0
 *   mat[1..3]   <- 3.0, 2.0, 1.0
 *
 * No work-item id is consulted, so every work-item that runs this kernel
 * performs the same writes.
 */
kernel void test(global float *mat, global float *rebin, global int *a)
{
    /* integer buffer */
    a[0] = 3;
    a[1] = 2;
    a[2] = 1;

    /* first float buffer: element 0 is left untouched */
    rebin[1] = 1.0f;
    rebin[2] = 2.0f;
    rebin[3] = 3.0f;

    /* second float buffer: element 0 is left untouched */
    mat[1] = 3.0f;
    mat[2] = 2.0f;
    mat[3] = 1.0f;
}
我的机器的控制台输出:
Using device: GeForce GTX 780
Global memory size: 3221225472
Max. allocation: 805306368
Allocation size: 376832000
Total memory to be allocated: 753664012
Error: -6
Couldn't create buffer 2: No error
Process returned 1 (0x1) execution time : 0.435 s
Press any key to continue.