我一直在尝试在MAC,Ubuntu和其他平台上分析用于FIR过滤的OpenCL主机代码。我的主机代码和内核如下所示。
问题在于,无论我为FIR滤波器提供的采样数量如何,clenquendrangelernel都会花费相同的时间。此外,我也分析了clEnqueueReadBuffer和clEnqueueWriteBuffer,不知怎的,他们最终也花了相同的时间。在mac中我使用mach进行分析以及使用OpenCL事件,在ubuntu中,我正在使用PAPI进行分析。我无法理解为什么会发生这种情况,理想情况是随着样本数量的增加,clEnqueueReadBuffer和clEnqueueWriteBuffer应该花费更多时间,因此应该执行内核。
内核: -
__kernel void fir4(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[4] = {5,7,5,7};
/*for(j=0;j<4;j++)
{
output[i] += coeff[j]*(input[i+4-j-1]);
}*/
//unrolled
output[i] += coeff[0]*(input[i+4-0-1]);
output[i] += coeff[1]*(input[i+4-1-1]);
output[i] += coeff[2]*(input[i+4-2-1]);
output[i] += coeff[3]*(input[i+4-3-1]);
}
__kernel void fir8(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[8] = {5,7,5,7,5,7,5,7};
for(j=0;j<8;j++)
{
output[i] += coeff[j]*(input[i+8-j-1]);
}
}
__kernel void fir12(
__global float* input,
__global float* output)
{
int i = get_global_id(0);
int j = 0;
int coeff[12] = {5,7,5,7,5,7,5,7,5,7,5,7};
for(j=0;j<12;j++)
{
output[i] += coeff[j]*(input[i+12-j-1]);
}
}
主机代码: -
// Use a static data size for simplicity
//
#define DATA_SIZE (48000)
#define NUM_COEFF (4)
int main(int argc, char** argv)
{
uint64_t start;
uint64_t end;
uint64_t elapsed;
double elapsedmilli;
int err; // error code returned from api calls
float data[DATA_SIZE]; // original data set given to device
float coeff[NUM_COEFF];
float results_host[DATA_SIZE] = {};
float results[DATA_SIZE]; // results returned from device
unsigned int correct; // number of correct results returned
size_t global; // global domain size for our calculation
size_t local; // local domain size for our calculation
cl_event event; //Linking event to kernel for profiling
cl_platform_id platform_id = NULL; // compute device platform id
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
cl_mem input; // device memory used for the input array
cl_mem output; // device memory used for the output array
// Fill our data set with random float values
//
int i,j = 0;
unsigned int count = DATA_SIZE;
unsigned int taps = NUM_COEFF;
for(i = 0; i < count; i++)
data[i] = rand() / (float)RAND_MAX;
for(i=0; i < taps; i++)
{
if(!(i%2))
coeff[i] = 5;
else
coeff[i] = 7;
}
//Connect to a platform on device
err = clGetPlatformIDs(1, &platform_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to locate opencl platform!\n");
return EXIT_FAILURE;
}
// Connect to a compute device
//
int gpu = 0;
err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return EXIT_FAILURE;
}
// Create a compute context
//
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return EXIT_FAILURE;
}
// Create a command commands
//
commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return EXIT_FAILURE;
}
//Use function and load the kernel source from .cl files in the same folder
//
char *KernelSource = load_program_source("fir.cl");
// Create the compute program from the source buffer
//
program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err);
if (!program)
{
printf("Error: Failed to create compute program!\n");
return EXIT_FAILURE;
}
// Build the program executable
//
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
printf("Error: Failed to build program executable!\n");
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
printf("%s\n", buffer);
exit(1);
}
// Create the compute kernel in the program we wish to run
//
switch(taps)
{
case(4):
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
case(8):
{
kernel = clCreateKernel(program, "fir8", &err);
break;
}
case(12):
{
kernel = clCreateKernel(program, "fir12", &err);
break;
}
default:
{
kernel = clCreateKernel(program, "fir4", &err);
break;
}
}
if (!kernel || err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel! - %d\n",err);
exit(1);
}
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
local = 48;
start = mach_absolute_time();
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &event);
if (err)
{
printf("Error: Failed to execute kernel!-%d\n",err);
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clWaitForEvents(1, &event);
clFinish(commands);
end = mach_absolute_time();
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
total_time = time_end - time_start;
printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0);
elapsed = end - start;
struct mach_timebase_info info;
mach_timebase_info(&info);
double t = 1e-9 * (elapsed) * info.numer / info.denom;
elapsedmilli = 1e-6 * (elapsed) * info.numer / info.denom;
printf("cl:main timing:MACH clEnqueueNDRangeKernel %f ms, %d elapsed\n",elapsedmilli,elapsed);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i=0; i<DATA_SIZE; i++)
{
for(j=0;j<NUM_COEFF;j++)
{
results_host[i]+=coeff[j]*(data[i+NUM_COEFF-j-1]);
}
//printf("Host Output[%d]-%f\n",i,results_host[i]);
}
for(i = 0; i < count; i++)
{
if(results[i] == results_host[i])
correct++;
//printf("CL Output[%d]-%f\n",i,results[i]);
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values! Samples-%d,Taps-%d\n", correct, count, DATA_SIZE, NUM_COEFF);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
答案 0 :(得分:0)
每个项目仅添加10-20次乘法和加法与内核开销时间不可比。尝试使用100或1000宽系数阵列。
使用这种方式为每个项目使用更多输入元素,只需增加缓存命中数(也是比率),因为更多线程从相同位置读取。
如果DATA_SIZE是几百万,那么所有数据都无法适应缓存,并且随着其长度线性变慢。 48000意味着小于200kB。例如,HD5850具有512 k二级高速缓存(3倍内存带宽)和每个计算单元8kB L1(太快)。