继续我的OpenCL冒险,这是我迄今为止从我的CUDA实施中得到的。我试图检查是否至少第一个内核调用正在工作,但我收到错误 48 并且不知道我错过了什么。我正在关注this page
中的示例KERNEL
__kernel
void clut_distributePixels(__global int *pixelGroup, int c_rows, int c_cols, int c_numColors){
int x = get_global_id(0);
int y = get_global_id(1);
if (x >= c_cols || y >= c_rows) return;
int index = y * c_cols + x;
pixelGroup[index] = index/c_numColors;
}
从文件中读取内核
char *file_contents(const char *filename, int *length){
FILE *f = fopen(filename, "r");
void *buffer;
if (!f) {
fprintf(stderr, "Unable to open %s for reading\n", filename);
return NULL;
}
fseek(f, 0, SEEK_END);
*length = ftell(f);
fseek(f, 0, SEEK_SET);
buffer = malloc(*length+1);
*length = fread(buffer, 1, *length, f);
fclose(f);
((char*)buffer)[*length] = '\0';
return (char*)buffer;
}
CODE
#include <iostream>
#include <OpenCL/OpenCL.h>
#include "Utilities.hpp"
int main(int argc, const char * argv[]){
if (argc < 3) {
std::cout << "Use: {GPU|CPU} nColors" << std::endl;
return 1;
}
/************************************************
HOST SIDE INITIALIZATION
************************************************/
int h_numColors = atoi(argv[2]);
Color *h_image;
int h_rows, h_cols;
if (readText2RGB("LenaOriginal.txt", &h_image, &h_rows, &h_cols) != SUCCESS){
return 1;
}
int *h_pixelGroup = new int[h_rows*h_cols];
Color *h_groupRep = new Color[h_numColors];
Color *h_clutImage = new Color[h_rows*h_cols];
int h_change = 0;
/************************************************
PLATFORM AND DEVICE SETUP
************************************************/
cl_int errorStatus;
//Use the first platform
cl_platform_id platform;
errorStatus = clGetPlatformIDs(1, &platform, NULL);
//Use the first device that matches the type selected
cl_device_id device;
if (strcmp(argv[1], "CPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL);
}else if (strcmp(argv[1], "GPU")){
errorStatus = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
}else{
std::cout << "Unknown device type. Choose either CPU or GPU" << std::endl;
return 1;
}
//Define context properties and create context
cl_context_properties contextProps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0};
cl_context context = clCreateContext(contextProps, 1, &device, NULL, NULL, &errorStatus);
//Create the command queue
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &errorStatus);
/************************************************
DEVICE VARIABLE SETUP
************************************************/
cl_mem d_image;
cl_mem d_pixelGroup;
cl_mem d_groupRep;
cl_mem d_clutImage;
cl_mem d_change;
d_image = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(Color)*h_rows*h_cols, h_image, &errorStatus);
d_pixelGroup = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*h_rows*h_cols, NULL, &errorStatus);
d_groupRep = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_numColors, NULL, &errorStatus);
d_clutImage = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(Color)*h_rows*h_cols, NULL, &errorStatus);
d_change = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int), NULL, &errorStatus);
/************************************************
CREATE, COMPILE PROGRAM and CREATE KERNEL
************************************************/
int pl;
size_t sourceLength;
char * sourceCode = file_contents("vectorQuantization.cl", &pl);
sourceLength = (size_t)pl;
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourceCode, &sourceLength, &errorStatus);
errorStatus = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel k_clut_distributePixels = clCreateKernel(program, "clut_distributePixels", &errorStatus);
errorStatus = clSetKernelArg(k_clut_distributePixels, 0, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_distributePixels, 1, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_distributePixels, 2, sizeof(cl_mem), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_distributePixels, 3, sizeof(cl_mem), (void*)&h_numColors);
cl_kernel k_clut_checkDistances = clCreateKernel(program, "clut_checkDistances", &errorStatus);
errorStatus = clSetKernelArg(k_clut_checkDistances, 0, sizeof(cl_mem), (void*)&d_image);
errorStatus = clSetKernelArg(k_clut_checkDistances, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_checkDistances, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_checkDistances, 3, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_checkDistances, 4, sizeof(cl_mem), (void*)&h_cols);
errorStatus = clSetKernelArg(k_clut_checkDistances, 5, sizeof(cl_mem), (void*)&h_numColors);
errorStatus = clSetKernelArg(k_clut_checkDistances, 6, sizeof(cl_mem), (void*)&d_change);
cl_kernel k_clut_createImage = clCreateKernel(program, "clut_createImage", &errorStatus);
errorStatus = clSetKernelArg(k_clut_createImage, 0, sizeof(cl_mem), (void*)&d_clutImage);
errorStatus = clSetKernelArg(k_clut_createImage, 1, sizeof(cl_mem), (void*)&d_pixelGroup);
errorStatus = clSetKernelArg(k_clut_createImage, 2, sizeof(cl_mem), (void*)&d_groupRep);
errorStatus = clSetKernelArg(k_clut_createImage, 3, sizeof(cl_mem), (void*)&h_rows);
errorStatus = clSetKernelArg(k_clut_createImage, 4, sizeof(cl_mem), (void*)&h_cols);
/************************************************
EXECUTE PROGRAM AND GET RESULTS
************************************************/
/*STEP 1: evenly distribute pixels among the colors in the CLUT */
size_t grid[2] = {static_cast<size_t>(h_rows), static_cast<size_t>(h_cols)};
errorStatus = clEnqueueNDRangeKernel(queue, k_clut_distributePixels, 2, NULL, grid, NULL, 0, NULL, NULL);
clFinish(queue);
/*********/
/* ERROR */
/*********/
errorStatus = clEnqueueReadBuffer(queue, d_pixelGroup, CL_TRUE, 0, sizeof(int)*h_rows*h_cols, h_pixelGroup, 0, NULL, NULL);
std::cout << h_pixelGroup[7] << ", " << h_pixelGroup[8] << ", " << h_pixelGroup[9] << ", " << h_pixelGroup[10] << std::endl;
//do {
/*STEP 2: compute reprenstative */
/*STEP 3: compute distances and reassign pixel to group */
//copyFromConstantMemory
//} while (h_change != 0);
std::cout << "Done !!" << std::endl;
return 0;
}
答案 0 :(得分:0)
我发现了我的错误。首先,在学习新内容时始终检查返回值。我只记得从我学习CUDA的那一刻起,所以用这个简单的宏开始检查所有内容
#define CL_SUCCESS_OR_RETURN(code) do { \
assert(code == CL_SUCCESS); \
if (code != CL_SUCCESS) { return code; } \
}while (0);
当我检查它是CPU还是GPU时,错误就在最开始。我忘了strcmp在字符串相等时返回0。解决这个问题后,一切都很美妙!!
无论如何,如果您有任何其他建议或建议,或者您发现某些丑陋或不是代码中的最佳做法,请发表评论。