您好,我是Open Cl的新手,我正尝试在 IMX8
上的GPU Vivante7200 Lite上测量GFLOPS。使用此代码的输出是0.127989 GFLOPS,而发布的FLOPS是32 那么这段代码可能出了什么问题。
这是内核代码
__kernel void flops(__global float* input , __global float* output) {
int gid = get_global_id(0);
float scale = 2.35;
float x = *input;
int i=0;
for(;i<62500000;++i)
{
scale = scale * (x);
}
output[gid] = scale;
}
这是主机程序
#include "gpu_test.h"
int main ()
{
gpu_intialize();
cl_int errNum;
cl_platform_id platform; /* OpenCL platform. */
cl_device_id device; /* OpenCL device. */
cl_context context; /* OpenCL context. */
cl_command_queue commandQueue; /* OpenCL command queue. */
cl_program program; /* OpenCL program. */
cl_kernel kernel; /* OpenCL kernel. */
cl_mem memObject_input; /* OpenCL memory buffer objects. */
cl_mem memObject_output; /* OpenCL memory buffer objects. */
cl_event eventFlops; /* OpenCL event. */
cl_event eventWrite;
long time0=0;
long time1=0;
size_t nSize=0;
char *buf;
size_t wg_size=0;
size_t wg_multiple=0;
size_t globalWorkSize[1] = {16};
size_t localWorkSize[1] = {1};
double flopCount=0;
double consumed_time_in_nano_seconds=0;
double consumed_time_in_seconds=0;
double flop=0;
double Gflop=0;
cl_long maxAllocSizeBytes = 0;
cl_long maxComputeUnits = 0;
cl_long maxWorkGroupSize = 0;
cl_ulong end=0;
cl_ulong start=0;
cl_ulong private_usage=0;
cl_ulong local_usage=0;
printf("Initializing OpenCL...\n");
/* Get the available platform. */
errNum = clGetPlatformIDs(1, &platform, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Get a GPU device. */
errNum = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Create the context. */
context = clCreateContext(0, 1, &device, NULL, NULL, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
/* Create a command-queue. */
commandQueue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Creating program...\n");
printf("Creating program with binary...\n");
nSize = kernel_binary_program_size;
buf = (char *)&kernel_binary_program;
program = clCreateProgramWithBinary(context, 1, &device, &nSize, (const unsigned char**)&buf, NULL, NULL);
if (program == NULL)
{
printf("program creation failed \n");
}
printf("Program created successfully! \n");
kernel = clCreateKernel (program, "flops", &errNum);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Kernel created successfully! \n");
// Identify maximum size of the global memory on the device side
clGetDeviceInfo (device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
sizeof(cl_long), &maxAllocSizeBytes, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(cl_long), &maxComputeUnits, NULL);
clGetDeviceInfo (device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(cl_long), &maxWorkGroupSize, NULL);
printf("Max compute unit is %d \n", maxComputeUnits);
printf("Max Work Group size is %d \n", maxWorkGroupSize);
// Creating buffer on the device side
float *input_buffer =(float *) malloc (sizeof(float));
float *output_buffer =(float *) malloc (sizeof(float));
input_buffer[0] = 5.36;
output_buffer[0] = 0;
printf("Host buffer been prepared! \n");
memObject_input = clCreateBuffer (context, CL_MEM_READ_ONLY,
sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);
memObject_output = clCreateBuffer (context, CL_MEM_WRITE_ONLY,
sizeof(float), NULL, &errNum);
clmCHECKERROR(errNum,CL_SUCCESS);
if ((memObject_input == NULL) || (memObject_output == NULL)) {
printf ("Error creating memory objects \n");
return false;
}
errNum = clEnqueueWriteBuffer (commandQueue, memObject_input, CL_FALSE, 0, sizeof(float),
input_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);
errNum = clEnqueueWriteBuffer (commandQueue, memObject_output, CL_FALSE, 0, sizeof(float),
output_buffer, 0, NULL, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
errNum = clWaitForEvents (1, &eventWrite);
clmCHECKERROR(errNum,CL_SUCCESS);
clReleaseEvent(eventWrite);
errNum = clSetKernelArg (kernel, 0, sizeof(cl_mem), &memObject_input);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clSetKernelArg (kernel, 1, sizeof(cl_mem), &memObject_output);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof (wg_size), &wg_size, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
sizeof (wg_multiple), &wg_multiple, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_LOCAL_MEM_SIZE,
sizeof (local_usage), &local_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
errNum = clGetKernelWorkGroupInfo (kernel, device,
CL_KERNEL_PRIVATE_MEM_SIZE,
sizeof (private_usage), &private_usage, NULL);
clmCHECKERROR (errNum, CL_SUCCESS);
printf("Work Group size is %d \n",wg_size);
printf("Preferred Work Group size is %d \n",wg_multiple);
printf("Local memory size is %d \n",local_usage);
printf("Private memory size is %d \n",private_usage);
errNum = clEnqueueNDRangeKernel (commandQueue, kernel, 1, NULL,
globalWorkSize, localWorkSize,
0, NULL, &eventFlops);
clmCHECKERROR (errNum, CL_SUCCESS);
printf("Waiting for execution to finish \n");
errNum = clWaitForEvents(1, &eventFlops);
clmCHECKERROR(errNum, CL_SUCCESS);
errNum = clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0);
errNum |= clGetEventProfilingInfo(eventFlops, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0);
clmCHECKERROR(errNum, CL_SUCCESS);
printf("Kernel execution terminated successfully! \n");
flopCount = 62500000 * 16; // 16 work item
consumed_time_in_nano_seconds = (double)(end - start);
consumed_time_in_seconds = consumed_time_in_nano_seconds/1000000000;
printf("consumed_time_in_nano_seconds : %f \n",consumed_time_in_nano_seconds);
printf("consumed_time_in_seconds : %f \n",consumed_time_in_seconds);
flop = flopCount / consumed_time_in_seconds;
Gflop = flop / 1000000000;
printf("Floating Operations per second : %f \n",flop);
printf("Giga Floating Operations per second : %f \n",Gflop);
cleanup(context,program,kernel,memObject_input,memObject_output,eventFlops);
printf ("Program executed successfully! \n");
}
void cleanup (cl_context context,
cl_program program,
cl_kernel kernel,
cl_mem memObject_input,
cl_mem memObject_output,
cl_event eventFlops) {
if ( eventFlops != NULL)
clReleaseEvent(eventFlops);
if (memObject_input != NULL)
clReleaseMemObject (memObject_input);
if (memObject_output != NULL)
clReleaseMemObject (memObject_output);
if (kernel != NULL)
clReleaseKernel (kernel);
if (program != NULL)
clReleaseProgram (program);
}
void checkError(cl_int Value,cl_int Reference,const char* FileName,const
int LineNumber)
{
if (Reference != Value)
{
printf("\n !!! Error # %i at line %i , in file %s !!!\n\n",
Value, LineNumber, FileName);
printf("Exiting...\n");
exit(EXIT_FAILURE);
}
}
void gpu_intialize()
{
Error error;
printf("[GalCore] GalCore_TaskInit\n");
error = GalCore_TaskInit();
if (error == 0)
printf("[GalCore] GalCore_TaskInit successful \ne");
else
printf("[GalCore] GalCore_TaskInit Failed \n");
printf("[GalCore] VivanteInit\n");
VivanteInit();
printf("[GalCore] VivanteInit successful \n");
}
所以我想知道该实现可能是什么问题
答案 0 :(得分:1)
无特殊顺序:
getTime()
)来衡量设备上代码的性能。在clGetEventProfilingInfo()
上使用eventFlops
。eventFlops
事件,以免发生记忆泄漏