我正在尝试使用CodeXL(或者更确切地说是sprofile)分析一些OpenCL代码。在性能计数器模式下进行性能分析时,程序总是给出错误的输出(但在使用跟踪选项-t时却不会),所以我试图找出原因。
经过一些实验后,我得出结论:每个内核都被执行了三次,这会使那些修改现有数据(而不是覆盖数据)的内核产生错误的结果。以下玩具程序展示了这种行为。
我的问题是:有没有人知道它为什么会这样做,以及如何阻止它这样做?
我的操作系统是Fedora Linux 18 CodeXL版本:CodeXL-Linux-1.1.1537.0 显卡:ATI Technologies Inc Device 6798
这是执行命令:
/opt/CodeXL-Linux-1.1.1537.0-x86_64-release/Output_x86_64/release/bin/x86_64/sprofile -o example.csv -w . OpenCLExample
我的代码:
// Creates an OpenCL context containing the GPU devices of the first
// available platform.
//
// Returns the new context, or NULL when no platform could be queried or the
// context could not be created. The caller owns the returned context and is
// responsible for releasing it with clReleaseContext().
cl_context CreateContext()
{
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId = NULL;

    // Only the first platform is requested; numPlatforms reports how many exist.
    cl_int errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        // Without this check an uninitialised platform id would be passed on.
        return NULL;
    }

    // Zero-terminated property list binding the context to that platform.
    cl_context_properties contextProperties[] =
    {
        CL_CONTEXT_PLATFORM,
        reinterpret_cast<cl_context_properties>(firstPlatformId),
        0
    };

    cl_context context = clCreateContextFromType(contextProperties, CL_DEVICE_TYPE_GPU,
                                                 NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        return NULL;
    }
    return context;
}
// Creates a command queue on the first device attached to `context`.
//
// context: context whose device list is queried.
// device:  out-parameter; receives the first device of the context once the
//          device list has been read successfully. Left untouched when the
//          device list cannot be obtained.
//
// Returns the new command queue, or NULL when the device list cannot be
// queried, is empty, or the queue cannot be created. The caller owns the
// returned queue.
cl_command_queue CreateCommandQueue(cl_context context,cl_device_id *device)
{
    // First call: ask only for the size (in bytes) of the device list.
    size_t deviceBufferSize = 0;
    cl_int errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
    const size_t deviceCount = deviceBufferSize / sizeof(cl_device_id);
    if (errNum != CL_SUCCESS || deviceCount == 0)
    {
        // Avoids allocating from a bogus size and indexing an empty list.
        return NULL;
    }

    // Second call: fetch the actual device ids.
    cl_device_id *devices = new cl_device_id[deviceCount];
    errNum = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices, NULL);
    if (errNum != CL_SUCCESS)
    {
        delete[] devices;
        return NULL;
    }

    *device = devices[0];
    cl_command_queue commandQueue = clCreateCommandQueue(context, devices[0], 0, &errNum);
    delete[] devices;

    if (errNum != CL_SUCCESS)
    {
        return NULL;
    }
    return commandQueue;
}
// Loads OpenCL C source from `filename`, creates a program object from it in
// `context` and builds it for all devices associated with the program.
//
// context:  context in which the program is created.
// device:   currently unused; the build targets every device of the program.
// filename: path of the kernel source file.
//
// Returns the built program, or NULL when the file cannot be opened, the
// program object cannot be created, or the build fails. The caller owns the
// returned program.
cl_program CreateProgram(cl_context context,cl_device_id device,const char* filename)
{
    (void)device;

    std::ifstream kernelFile(filename, std::ios::in);
    if (!kernelFile.is_open())
    {
        return NULL;
    }

    // Slurp the whole file into one string.
    std::ostringstream oss;
    oss << kernelFile.rdbuf();
    const std::string srcStdStr = oss.str();
    const char *srcStr = srcStdStr.c_str();

    cl_int errNum = CL_SUCCESS;
    // A NULL lengths array means the single source string is null-terminated.
    cl_program program = clCreateProgramWithSource(context, 1, &srcStr, NULL, &errNum);
    if (errNum != CL_SUCCESS || program == NULL)
    {
        return NULL;
    }

    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        // Do not hand an unbuilt program back to the caller.
        clReleaseProgram(program);
        return NULL;
    }
    return program;
}
// Creates the three device buffers used by the example kernel.
//
// context:    context in which the buffers are created.
// memObjects: out-array; [0] and [1] become read-only buffers initialised by
//             copying ARRAY_SIZE floats from `a` and `b`, [2] becomes an
//             uninitialised read-write buffer of the same size.
// a, b:       host arrays holding at least ARRAY_SIZE floats each.
//
// Returns true only when all three buffers were created. On failure, any
// buffers that were created remain in `memObjects` for the caller to release.
bool CreateMemObjects(cl_context context,cl_mem memObjects[3],float *a,float *b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

    memObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, b, NULL);
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bufferBytes, NULL, NULL);

    // clCreateBuffer yields NULL on failure; report that instead of always
    // claiming success.
    return memObjects[0] != NULL && memObjects[1] != NULL && memObjects[2] != NULL;
}
int main(int arg,char** argv)
{
cl_context context=0;
cl_command_queue commandQueue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem memObjects[3] = {0,0,0};
cl_int errNum;
context = CreateContext();
commandQueue = CreateCommandQueue(context,&device);
program = CreateProgram(context,device,"Example.cl");
kernel = clCreateKernel(program,"example_kernel",NULL);
float result[ARRAY_SIZE];
float a[ARRAY_SIZE];
float b[ARRAY_SIZE];
for(int i=0;i<ARRAY_SIZE;i++)
{
a[i] = i;
b[i] = i*2;
}
if(!CreateMemObjects(context,memObjects,a,b))
return 1;
errNum = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObjects[0]);
errNum |= clSetKernelArg(kernel,1,sizeof(cl_mem),&memObjects[1]);
errNum |= clSetKernelArg(kernel,2,sizeof(cl_mem),&memObjects[2]);
size_t globalWorkSize[1] = {ARRAY_SIZE};
size_t localWorkSize[1] = { 1 };
errNum = clEnqueueNDRangeKernel(commandQueue,kernel,1,NULL,globalWorkSize,localWorkSize,0,
NULL,NULL);
errNum = clEnqueueReadBuffer(commandQueue,memObjects[2], CL_TRUE,
0,ARRAY_SIZE*sizeof(float),result,
0,NULL,NULL);
return 0;
}
内核:
#pragma OPENCL EXTENSION cl_amd_printf : enable
// Element-wise product: each work-item multiplies a[idx] by b[idx], stores the
// product in result[idx], and prints a debug line with its id and the stored
// value.
kernel void example_kernel(global const float *a,
                           global const float *b,
                           global float *result)
{
    const int idx = get_global_id(0);
    const float product = a[idx] * b[idx];
    result[idx] = product;
    printf((__constant char *)"DEBUG: example_kernel id: %d result: %g\n", idx, result[idx]);
}
这就是我得到的结果:
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18
DEBUG: example_kernel id: 0 result: 0
DEBUG: example_kernel id: 1 result: 2
DEBUG: example_kernel id: 2 result: 8
DEBUG: example_kernel id: 3 result: 18
答案 0 :(得分:1)
它之所以这样运行,是因为GPU分析器会重放内核若干次,以便收集所有相关硬件计数器的值(在内核的单次运行中可以查询的硬件计数器数量是有限的)。这里提供了一个有用的答案:http://devgurus.amd.com/message/1297746