我正在尝试打印GPU上某些功能的执行时间。但GPU上的时序总是为0.此外,当我在下面选择CL_DEVICE_TYPE_CPU时,它工作正常。
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices);
这很好并且显示执行时间的非零值但是如果我选择CL_DEVICE_TYPE_GPU,那么它总是显示0,而不管总数是多少。数据点和线程。请注意,在两种情况下(CL_DEVICE_TYPE_CPU和CL_DEVICE_TYPE_GPU),我都以相同的方式打印执行时间。这是我的主机代码,我的内核代码在两种情况下都是相同的(这就是openCL!)。以下是一些代码部分:
// openCL code to get platform and device ids
errcode = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
errcode = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
// to create context
clGPUContext = clCreateContext( NULL, 1, &device_id, NULL, NULL, &errcode);
//Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext,
device_id, CL_QUEUE_PROFILING_ENABLE, &errcode);
// Setup device memory
d_instances= clCreateBuffer(clGPUContext,CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR,mem_size_i,instances->data, &errcode);
d_centroids = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_c, NULL, &errcode);
d_distance = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size_d,NULL, &errcode);
// d_dist_X = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//d_dist_Y = clCreateBuffer(clGPUContext,CL_MEM_READ_WRITE,mem_size4,NULL, &errcode);
//to build program
clProgram = clCreateProgramWithSource(clGPUContext,1, (const char **)&source_str,(const
size_t*)&source_size, &errcode);
errcode = clBuildProgram(clProgram, 0,NULL, NULL, NULL, NULL);
if (errcode == CL_BUILD_PROGRAM_FAILURE)
{
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
&log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(clProgram, device_id, CL_PROGRAM_BUILD_LOG, log_size, log,
NULL);
// Print the log
printf("%s\n", log);
}
clKernel = clCreateKernel(clProgram,"distance_finding", &errcode);
// Launch OpenCL kernel
size_t localWorkSize[1], globalWorkSize[1];
if(num_instances >= 500)
{
localWorkSize[0] = 500;
float block1=num_instances/localWorkSize[0];
int block= (int)(ceil(block1));
globalWorkSize[0] = block*localWorkSize[0];
}
else
{
localWorkSize[0]=num_instances;
globalWorkSize[0]=num_instances;
}
int iteration=1;
while(iteration < MAX_ITERATIONS)
{
errcode = clEnqueueWriteBuffer(clCommandQue,d_centroids , CL_TRUE, 0,
mem_size_c, (void*)centroids->data, 0, NULL, NULL);
errcode = clEnqueueWriteBuffer(clCommandQue,d_distance , CL_TRUE, 0, mem_size_d,
(void*)distance->data, 0, NULL, NULL);
//set kernel arguments
errcode = clSetKernelArg(clKernel, 0,sizeof(cl_mem), (void *)&d_instances);
errcode = clSetKernelArg(clKernel, 1,sizeof(cl_mem), (void *)&d_centroids);
errcode = clSetKernelArg(clKernel, 2,sizeof(cl_mem), (void *)&d_distance);
errcode = clSetKernelArg(clKernel, 3,sizeof(unsigned int), (void *)
&num_instances);
errcode = clSetKernelArg(clKernel,4,sizeof(unsigned int),(void *)&clusters);
errcode = clSetKernelArg(clKernel,5,sizeof(unsigned int),(void *)&dimensions);
errcode = clEnqueueNDRangeKernel(clCommandQue,clKernel, 1, NULL,
globalWorkSize,localWorkSize, 0, NULL, &myEvent);
clFinish(clCommandQue); // wait for all events to finish
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(cl_ulong),
&startTime, NULL);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(cl_ulong),
&endTime, NULL);
kernelExecTimeNs = endTime-startTime;
gpu_time+= kernelExecTimeNs;
// Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue,d_distance, CL_TRUE, 0,
mem_size_d,distance->data, 0, NULL, NULL);
以毫秒打印时间
printf("\n\n Time taken by GPU is %llu ms",gpu_time/1000000);
如果我计算GPU时序的方式是错误的,为什么它会在CPU上工作(通过更改为CL_DEVICE_TYPE_CPU)?这有什么不对?
编辑:
系统信息
AMD APP SDK 2.4 AMD ATI FirePro GL 3D,拥有800个核心
Kerenel
#pragma OPENCL EXTENSION cl_khr_fp64:enable
double distance_cal(__local float* cent,float* data,int dimensions)
{
float dist1=0.00;
for(int i=0;i<dimensions;i++)
dist1 += ((data[i]-cent[i]) * (data[i]-cent[i]));
double sq_dist=sqrt(dist1);
return sq_dist;
}
void fetch_col(float* data,__constant float* x,int col,int dimension,int len)
{
//hari[i]=8;
for(int i=0;i<dimension;i++)
{
data[i]=x[col];
col=col+len;
}
}
void fetch_col_cen(__local float* data,__global float* x,int col,int dimension,int len)
{
//hari[i]=8;
for(int i=0;i<dimension;i++)
{
data[i]=x[col];
col=col+len;
}
}
__kernel void distance_finding(__constant float* data,__global float* cen,__global float*
dist,int inst,int clus,const int dimensions)
{
int idx=get_global_id(0);
float data_col[4];
fetch_col( data_col,data,idx,dimensions,inst);
for(int i=0;i<clus;i++)
{
int k=i*inst; // take each dimension value for each cluster data
__local float cent[4];
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
fetch_col_cen(cent,cen,i,dimensions,clus);
dist[idx+k]=distance_cal(cent,data_col,dimensions);// calculate distance wrt
each data n each centroid
}
}
答案 0 :(得分:1)
clEnqueueNDRangeKernel()
如果使用GPU则是异步的,因此您只看到将请求排队但不执行它所花费的时间。
那就是说,我可能错了,但我通常会编写c ++代码来执行时序并将start_time
放在指令之前,而end_time
放在
clFinish(cmd_queue);
就像你使用C ++时序码一样,如果你确定你的GPU不应该在0
秒完成,那将是一个很好的测试。
答案 1 :(得分:0)
一种简单的检查方法是在内核中引入异常长的操作。如果在实际执行中存在明显滞后时,该值显示为零 - 那么您就得到了答案。
那就是说,我believe(尽管所指出的主题是用于Linux,它可能在Windows上也存在用水)你可能需要安装已检测的驱动程序,甚至让系统写入性能计数器。您还可以在nVidia的OpenCL实现上使用CUDA探查器,因为它位于CUDA之上。
答案 2 :(得分:0)
更改为
clFinish(clCommandQue); // wait for all events to finish
// add this after clFinish()
// Ensure kernel execution is finished
clWaitForEvents(1 , &myEvent);
..
double gpu_time = endTime-startTime;
..
printf("\n\n Time taken by GPU is %0.3f ms", gpu_time/1000000.0);