我已经完成了关于Cuda的课程,现在我试图通过将我的代码从类移植到openCL来学习一些openCL,但是我在OpenCL上使用本地内存时遇到了麻烦。下面的内核输出错误的结果,而具有相同主机代码的更简单的内核输出正确的答案。我曾尝试使用1D工作组等重写内核,但结果却相同。我正在从一个名为“matrixMultTiled.cl”的单独文件中加载内核。
内核代码:
#define tW 16
__kernel void matrixMult(int HeightA,int WidthA, int WidthB,
__global float *A, __global float *B,
__global float *C)
{
int t,k;
float sum;
int tx = get_local_id(1), ty = get_local_id(0);
int row = get_global_id(1), col = get_global_id(0);
__local float sA[tW][tW],sB[tW][tW];
sum=0.;
for(t=0; t < (WidthA-1)/tW+1; t+=1){
if( (row<HeightA) && (t*tW+tx<WidthA) )
sA[ty][tx] = A[row*WidthA+(t*tW+tx)];
else
sA[ty][tx] = 0.;
if( (t*tW+ty<WidthA) && (col<WidthB) )
sB[ty][tx] = B[(t*tW+ty)*WidthB+col];
else
sB[ty][tx] = 0.;
barrier(CLK_LOCAL_MEM_FENCE);
for(k=0;k<tW;k+=1)
sum += sA[ty][k]*sB[k][tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
if((row<HeightA)&&(col<WidthB))
C[row*WidthB+col] = sum;
}
简单内核代码:
__kernel void matrixMult(int HeightA,int WidthA, int WidthtB,
__global float *A, __global float *B,
__global float *C)
{
int k;
float sum;
int row = get_global_id(1);
int col = get_global_id(0);
if((row<=HeightA)&&(col<=WidthtB)){
sum=0.0f;
for(k=0;k<WidthA;k+=1)
sum += A[row*WidthA+k]*B[k*WidthtB+col];
C[row*WidthtB+col] = sum;
}
}
主机代码:
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <CL/cl.h>
#define DEBUG_MODE false
#define clCheck(stmt) if((stmt)!= CL_SUCCESS){printf("%s\n",getErrorString(stmt));}
#define clExec(expr) {status=expr clCheck(status); if(status!=CL_SUCCESS){return status;}}
#define clGuard(expr) {expr clCheck(status); if(status!=CL_SUCCESS){return status;}}
const char *getErrorString(cl_int error);
char* read_file(const char *filename);
int platformInit(cl_uint *numPlatformsOut, cl_uint *numDevicesOut,
cl_platform_id **platformsOut, cl_device_id **devicesOut);
int contextQueueInit(cl_uint numDevices, cl_device_id *devices,int dev,
cl_context *contextOut, cl_command_queue *cmdQueueOut);
int initSingleKernelProgram(const char *progFile,cl_context context,
size_t numDev, cl_device_id* devices,
const char *kernelName,cl_program *programOut,
cl_kernel *kernelOut);
int main(){
const char *kernelFile="matrixMultTiled.cl";
int i,j,k,err,L,M,N;
int WidthA=10,HeightA=10,WidthB=10;
size_t offset=0,datasize, kerCount=1,*lenghts=NULL;
float *hA,*hB,*hC;
char *programSource;
bool result=true;
cl_int status;
cl_bool blocking;
cl_uint numPlat, numDev;
cl_platform_id *platforms;
cl_device_id *devices;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_kernel kernel;
cl_mem dA,dB,dC;
L=3;
M=3;
N=3;
datasize = L*M*sizeof(float);
hA = (float*)malloc(datasize);
datasize = M*N*sizeof(float);
hB = (float*)malloc(datasize);
datasize = L*N*sizeof(float);
hC = (float*)malloc(datasize);
for(i=0;i<L;i+=1)
for(j=0;j<M;j+=1)
hA[i*M+j] = i+j;
for(i=0;i<M;i+=1)
for(j=0;j<N;j+=1)
hB[i*M+j] = i*j;
err=platformInit(&numPlat,&numDev,&platforms,&devices);
if(err!=0){printf("problem in platform inicialization\n");return err;}
err=contextQueueInit(numDev, devices,0,&context, &queue);
if(err!=0){printf("problem in context and queue inicialization\n");return err;}
err = initSingleKernelProgram(kernelFile,context,numDev,
devices,"matrixMult",&program,&kernel);
if(err!=0){printf("problem with program or kernel init\n"); return err;}
datasize = L*M*sizeof(float);
clGuard(dA =clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);)
datasize = M*N*sizeof(float);
clGuard(dB =clCreateBuffer(context,CL_MEM_READ_ONLY,datasize,NULL,&status);)
datasize = L*N*sizeof(float);
clGuard(dC =clCreateBuffer(context,CL_MEM_WRITE_ONLY,datasize,NULL,&status);)
offset=0;
blocking = CL_TRUE;
// last 3 arguments : 0 events in event_list, no wait_list, no event
datasize = L*M*sizeof(float);
clExec(clEnqueueWriteBuffer(queue,dA,blocking,offset,datasize,
hA,0,NULL,NULL););
// last 3 arguments : 0 events in event_list, no wait_list, no event
datasize = M*N*sizeof(float);
clExec(clEnqueueWriteBuffer(queue,dB,blocking,offset,datasize,
hB,0,NULL,NULL);)
status = clSetKernelArg(kernel,0, sizeof(cl_int), &L);
status |= clSetKernelArg(kernel,1, sizeof(cl_int), &M);
status |= clSetKernelArg(kernel,2, sizeof(cl_int), &N);
status |= clSetKernelArg(kernel,3, sizeof(cl_mem), &dA);
status |= clSetKernelArg(kernel,4, sizeof(cl_mem), &dB);
status |= clSetKernelArg(kernel,5, sizeof(cl_mem), &dC);
clCheck(status); if(status!=CL_SUCCESS){
printf("Problem Setting program Arguments\n");
return status;
}
size_t localWS[2],globalWS[2];
localWS[0] =16; localWS[1]=16;
globalWS[0] = ((L/16)+1)*16; globalWS[1]= ((N/16)+1)*16;
clExec(clEnqueueNDRangeKernel(queue, kernel, 2, NULL,
globalWS, localWS, 0, NULL, NULL);)
blocking = CL_TRUE;
datasize = L*N*sizeof(float);
clGuard( clEnqueueReadBuffer(queue,dC,blocking,0,datasize,
hC,0,NULL,NULL); )
// check the result
printf("result:\n");
for(i=0;i<L;i+=1){
for(j=0;j<N;j+=1)
printf("%f ",hC[i*N+j]);
printf("\n");
}
// Free OpenCL resources
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseMemObject(dA);
clReleaseMemObject(dB);
clReleaseMemObject(dC);
// Free host resources
free(hA);
free(hB);
free(hC);
free(platforms);
free(devices);
return 0;
}
const char *getErrorString(cl_int error){
switch(error){
// run-time and JIT compiler errors
case 0: return "CL_SUCCESS";
case -1: return "CL_DEVICE_NOT_FOUND";
case -2: return "CL_DEVICE_NOT_AVAILABLE";
case -3: return "CL_COMPILER_NOT_AVAILABLE";
case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
case -5: return "CL_OUT_OF_RESOURCES";
case -6: return "CL_OUT_OF_HOST_MEMORY";
case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE";
case -8: return "CL_MEM_COPY_OVERLAP";
case -9: return "CL_IMAGE_FORMAT_MISMATCH";
case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
case -11: return "CL_BUILD_PROGRAM_FAILURE";
case -12: return "CL_MAP_FAILURE";
case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
case -15: return "CL_COMPILE_PROGRAM_FAILURE";
case -16: return "CL_LINKER_NOT_AVAILABLE";
case -17: return "CL_LINK_PROGRAM_FAILURE";
case -18: return "CL_DEVICE_PARTITION_FAILED";
case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
// compile-time errors
case -30: return "CL_INVALID_VALUE";
case -31: return "CL_INVALID_DEVICE_TYPE";
case -32: return "CL_INVALID_PLATFORM";
case -33: return "CL_INVALID_DEVICE";
case -34: return "CL_INVALID_CONTEXT";
case -35: return "CL_INVALID_QUEUE_PROPERTIES";
case -36: return "CL_INVALID_COMMAND_QUEUE";
case -37: return "CL_INVALID_HOST_PTR";
case -38: return "CL_INVALID_MEM_OBJECT";
case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
case -40: return "CL_INVALID_IMAGE_SIZE";
case -41: return "CL_INVALID_SAMPLER";
case -42: return "CL_INVALID_BINARY";
case -43: return "CL_INVALID_BUILD_OPTIONS";
case -44: return "CL_INVALID_PROGRAM";
case -45: return "CL_INVALID_PROGRAM_EXECUTABLE";
case -46: return "CL_INVALID_KERNEL_NAME";
case -47: return "CL_INVALID_KERNEL_DEFINITION";
case -48: return "CL_INVALID_KERNEL";
case -49: return "CL_INVALID_ARG_INDEX";
case -50: return "CL_INVALID_ARG_VALUE";
case -51: return "CL_INVALID_ARG_SIZE";
case -52: return "CL_INVALID_KERNEL_ARGS";
case -53: return "CL_INVALID_WORK_DIMENSION";
case -54: return "CL_INVALID_WORK_GROUP_SIZE";
case -55: return "CL_INVALID_WORK_ITEM_SIZE";
case -56: return "CL_INVALID_GLOBAL_OFFSET";
case -57: return "CL_INVALID_EVENT_WAIT_LIST";
case -58: return "CL_INVALID_EVENT";
case -59: return "CL_INVALID_OPERATION";
case -60: return "CL_INVALID_GL_OBJECT";
case -61: return "CL_INVALID_BUFFER_SIZE";
case -62: return "CL_INVALID_MIP_LEVEL";
case -63: return "CL_INVALID_GLOBAL_WORK_SIZE";
case -64: return "CL_INVALID_PROPERTY";
case -65: return "CL_INVALID_IMAGE_DESCRIPTOR";
case -66: return "CL_INVALID_COMPILER_OPTIONS";
case -67: return "CL_INVALID_LINKER_OPTIONS";
case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT";
// extension errors
case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR";
case -1001: return "CL_PLATFORM_NOT_FOUND_KHR";
case -1002: return "CL_INVALID_D3D10_DEVICE_KHR";
case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR";
case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR";
case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR";
default: return "Unknown OpenCL error";
}
}
char* read_file(const char *filename){
int err,offset=0;
size_t size;
FILE *file = fopen(filename,"r");
if(file==NULL)
return NULL;
err=fseek(file,offset,SEEK_END);
if(err<0){fclose(file); return NULL;}
size = ftell(file); // anotating the end of File
err=fseek(file,offset,SEEK_SET);
if(err<0){fclose(file); return NULL;}
char *content = (char*) malloc((size+1)*sizeof(char));
if(err<0){fclose(file); return NULL;}
err=fread(content,sizeof(char),size,file);
if(err<0){fclose(file); free(content); return NULL;}
err=fclose(file);
content[size]='\0';
return content;
}
int platformInit(cl_uint *numPlatformsOut, cl_uint *numDevicesOut,
cl_platform_id **platformsOut, cl_device_id **devicesOut)
{
cl_int status,i,j;
cl_uint numPlatforms = 0, numDevices = 0;
cl_platform_id *platforms = NULL;
cl_device_id *devices = NULL;
// first call -- get the number of platforms
status = clGetPlatformIDs(0,NULL,&numPlatforms);
clCheck(status);
if(status!=CL_SUCCESS){return (status);}
platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id));
if(platforms==NULL){return (-2001);}
// second call -- get the platforms information
status = clGetPlatformIDs(numPlatforms,platforms,NULL);
clCheck(status);
if(status!=CL_SUCCESS){free(platforms); return (status);}
if(DEBUG_MODE==true)
printf("numPlatforms = %d\n",numPlatforms);
for(i=0;i<numPlatforms;i+=1){
char buf[1024+1];
cl_uint dev_count;
status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(buf),
buf, NULL);
clCheck(status);
if(status!=CL_SUCCESS){free(platforms); return (status);}
if(DEBUG_MODE == true)
printf("platform %d: vendor '%s'\n",i,buf);
status = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL,0,
NULL,&dev_count);
clCheck(status);
if(status!=CL_SUCCESS){free(platforms); return (status);}
devices = (cl_device_id*)malloc(dev_count*sizeof(cl_device_id));
if(devices==NULL){free(platforms); return (-2002);}
// second call -- get the devices information
status = clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_ALL,dev_count,
devices,NULL);
clCheck(status);
if(status!=CL_SUCCESS){free(platforms); free(devices);return (status);}
for(j=0;j<dev_count;j+=1){
char bufDev[1024+1];
status = clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(bufDev),
bufDev, NULL);
if(DEBUG_MODE==true)
printf(" device %d: '%s'\n",j,bufDev);
}
if(i==0)
numDevices=dev_count;
}
if(DEBUG_MODE==true)
printf("%d %d %p %p\n",numPlatforms, numDevices,platforms,devices);
*numPlatformsOut = numPlatforms;
*numDevicesOut = numDevices;
*platformsOut = platforms;
*devicesOut = devices;
return 0;
}
int contextQueueInit(cl_uint numDevices, cl_device_id *devices,int dev,
cl_context *contextOut, cl_command_queue *cmdQueueOut)
{
cl_int status;
cl_context context = NULL;
cl_command_queue cmdQueue;
// create a context for given devices
context = clCreateContext(NULL,numDevices,devices,NULL,NULL,&status);
clCheck(status);
if(status!=CL_SUCCESS){
if(context==NULL)
return status;
else{
clReleaseContext(context);
return status;
}
}
// create a command queue in this context for device dev
cmdQueue = clCreateCommandQueue(context,devices[dev],0,&status);
clCheck(status);
if(status!=CL_SUCCESS){
if(context==NULL)
return status;
else{
if(cmdQueue!=NULL){
clReleaseCommandQueue(cmdQueue);
}
clReleaseContext(context);
return status;
}
}
*contextOut = context;
*cmdQueueOut = cmdQueue;
return 0;
}
int initSingleKernelProgram(const char *progFile,cl_context context,
size_t numDev, cl_device_id* devices,
const char *kernelName,cl_program *programOut,
cl_kernel *kernelOut)
{
cl_uint status;
char *programSource=NULL;
cl_program program=NULL;
cl_kernel kernel=NULL;
programSource = read_file(progFile);
if(programSource==NULL)
return -2001;
if(DEBUG_MODE==true)
printf("\n\n%s\n\n",programSource);
program = clCreateProgramWithSource(context,1,(const char**)&programSource,
NULL,&status);
clCheck(status);
if(status!=CL_SUCCESS){
free(programSource);
return status;
}
status = clBuildProgram(program,numDev,devices,NULL,NULL,NULL);
clCheck(status);
if(status!=CL_SUCCESS){
if (status == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t logSize;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
// Allocate memory for the log
char *logStr = (char *) malloc(logSize);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logSize, logStr, NULL);
// Print the log
printf("%s\n", logStr);
}
clReleaseProgram(program);
return status;
}
if(DEBUG_MODE==true){
// Determine the size of the log
size_t logSize;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
// Allocate memory for the log
char *logStr = (char *) malloc(logSize);
// Get the log
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, logSize, logStr, NULL);
// Print the log
printf("%s\n", logStr);
}
kernel = clCreateKernel(program, kernelName,&status);
clCheck(status);
if(status!=CL_SUCCESS){
clReleaseProgram(program);
if(kernel!=NULL)
clReleaseKernel(kernel);
return status;
}
*programOut = program;
*kernelOut = kernel;
free(programSource);
return 0;
}
任何亮点或建议都非常受欢迎