我的内核在Intel HD显卡上运行得非常好。但是,当我想在我的GeForce 960上运行内核时,它会给出CL_OUT_OF_RESOURCES错误。
我已尝试过不同的本地大小,并确保不会超出数组索引,但仍然不知道为什么会发生此错误。你知道为什么我的代码在英特尔上运行正常并且不能在NVIDIA上运行吗?
我的代码中发生的一件奇怪的事情是,我有类似操作的13种情况。出于性能目的,我重复了相同的操作13次,并避免编写循环只是为了保存循环所具有的一些额外操作。当我到达第11个操作时,代码适用于NVIDIA。但是,当我在代码中包含第12个操作时,它会产生上述错误,并且第11个和第12个操作类似!任何想法为什么会发生这种事情?
这是内核:
float2 projectCube(float3 axis, float3 vertex){
float voxelSize = 0.5f;
float2 projection = (float2)(0.0f, 0.0f);
float temp;
//1
temp = axis.x;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//2
temp = axis.x + axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//3
temp = axis.y;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//4
temp = axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//5
temp = axis.x + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//6
temp = axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
//7
temp = axis.x + axis.y + axis.z;
if (projection.x > temp){ projection.x = temp; }
else if (projection.y < temp){ projection.y = temp; }
float product = dot(axis, vertex);
projection.x = voxelSize * projection.x + product;
projection.y = voxelSize * projection.y + product;
return projection;
}
float2 projectTriangle(float3 axis, float3 v0, float3 v1, float3 v2){
float2 projection;
projection.x = dot(axis, v0);
projection.y = projection.x;
float temp = dot(axis, v1);
if(projection.x > temp){
projection.x = temp;
}
else if(projection.y < temp){
projection.y = temp;
}
temp = dot(axis, v2);
if (projection.x > temp){
projection.x = temp;
}
else if (projection.y < temp){
projection.y = temp;
}
return projection;
}
float tester(float3 axis, float3 voxel, float3 v0, float3 v1, float3 v2){
float2 voxelProjection = projectCube(axis, voxel);
float2 faceProjection = projectTriangle(axis, v0, v1, v2);
float minProjection = fmin(voxelProjection.x, faceProjection.x);
float maxProjection = fmax(voxelProjection.y, faceProjection.y);
float testResult = maxProjection - minProjection - voxelProjection.y + voxelProjection.x
- faceProjection.y + faceProjection.x;
return testResult;
}
__kernel void voxelizer(size_t global_size,
float h_voxelSize,
__global float* h_minBoundsGrid,
__global int *h_dimGrid,
__global float* coords,
__global int* density)
{
//printf("local size is: %d\n", get_num_groups(0));
int i = get_global_id(0) * 9;
if (i <= global_size * 9){
float voxelSize = h_voxelSize;
float3 minBoundsGrid;
minBoundsGrid.x = h_minBoundsGrid[0];
minBoundsGrid.y = h_minBoundsGrid[1];
minBoundsGrid.z = h_minBoundsGrid[2];
int3 dimGrid;
dimGrid.x = h_dimGrid[0];
dimGrid.y = h_dimGrid[1];
dimGrid.z = h_dimGrid[2];
if ( i %9 == 0){
/*Triangle vertices*/
float3 v0;
v0 = (float3)(coords[i], coords[i + 1], coords[i + 2]);
float3 v1;
v1 = (float3)(coords[i + 3], coords[i + 4], coords[i + 5]);
float3 v2;
v2 = (float3)(coords[i + 6], coords[i + 7], coords[i + 8]);
//printf("i = %d. v0: %f, %f, %f\n", i, v0.x, v0.y, v0.z);
//printf("i = %d. v1: %f, %f, %f\n", i, v1.x, v1.y, v1.z);
//printf("i = %d. v2: %f, %f, %f\n", i, v2.x, v2.y, v2.z);
/*Normal vectors of the each voxel*/
float3 e0;
e0 = (float3)(0.5f, 0.0f, 0.0f);
float3 e1;
e1 = (float3)(0.0f, 0.5f, 0.0f);
float3 e2;
e2 = (float3)(0.0f, 0.0f, 0.5f);
/*Edges of a traingle*/
float3 f0;
f0 = v1 - v0;
float3 f1;
f1 = v2 - v1;
float3 f2;
f2 = v0 - v2;
float3 minLocalGrid;
minLocalGrid.x = fmin(v0.x, fmin(v1.x, v2.x));
minLocalGrid.y = fmin(v0.y, fmin(v1.y, v2.y));
minLocalGrid.z = fmin(v0.z, fmin(v1.z, v2.z));
minLocalGrid.x = voxelSize * floor(minLocalGrid.x / voxelSize);
minLocalGrid.y = voxelSize * floor(minLocalGrid.y / voxelSize);
minLocalGrid.z = voxelSize * floor(minLocalGrid.z / voxelSize);
//printf("i = %d. minLocalGrid = %f, %f, %f.\n", i, minLocalGrid.x, minLocalGrid.y, minLocalGrid.z);
float3 maxLocalGrid;
maxLocalGrid.x = fmax(v0.x, fmax(v1.x, v2.x));
maxLocalGrid.y = fmax(v0.y, fmax(v1.y, v2.y));
maxLocalGrid.z = fmax(v0.z, fmax(v1.z, v2.z));
maxLocalGrid.x = voxelSize * ceil(maxLocalGrid.x / voxelSize);
maxLocalGrid.y = voxelSize * ceil(maxLocalGrid.y / voxelSize);
maxLocalGrid.z = voxelSize * ceil(maxLocalGrid.z / voxelSize);
if (maxLocalGrid.x == minLocalGrid.x){ maxLocalGrid.x += voxelSize; }
if (maxLocalGrid.y == minLocalGrid.y){ maxLocalGrid.y += voxelSize; }
if (maxLocalGrid.z == minLocalGrid.z){ maxLocalGrid.z += voxelSize; }
//printf("i = %d. maxLocalGrid = %f, %f, %f.\n", i, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("i = %d\n v0 = %f, %f, %f\n v1 = %f, %f, %f\n v2 = %f, %f, %f\n minLocalGrid = %f, %f, %f\n===============\n",
// i, v0.x, v0.y, v0.z, v1.x, v1.y, v1.z, v2.x, v2.y, v2.z, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
float j = minLocalGrid.z;
while(j < maxLocalGrid.z){
float k = minLocalGrid.y;
while(k < maxLocalGrid.y){
float l = minLocalGrid.x;
while (l < maxLocalGrid.x){
float3 firstVertexOfVoxel = (float3)(l, k, j);
//printf("l,k,j: %f, %f, %f\n", l, k, j);
float3 globalCoordOffset = (firstVertexOfVoxel - minBoundsGrid) / voxelSize;
int3 globalDimOffset = convert_int3_rtz(globalCoordOffset);
//printf("i = %d. globalCoordOffset: %f, %f, %f\n", i, globalCoordOffset.x, globalCoordOffset.y, globalCoordOffset.z);
//printf("i = %d. globalDimOffset: %d, %d, %d\n", i, globalDimOffset.x, globalDimOffset.y, globalDimOffset.z);
int voxelIndexGlobalGrid = globalDimOffset.x + dimGrid.x * (globalDimOffset.y +
dimGrid.y * globalDimOffset.z);
//printf("i = %d. voxelIndexGlobalGrid = %d\n", i, voxelIndexGlobalGrid);
if (density[voxelIndexGlobalGrid] != 1){
/*The famous 13-axes test*/
float3 axis;
float testResult = 0;
int overlapCount = 0;
//1
testResult = tester(e0, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//2
testResult = tester(e1, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//3
testResult = tester(e2, firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//4
//axis = ;
testResult = tester(cross(-f2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//5
/*axis = cross(e0, f0);*/
testResult = tester(cross(e0, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//6
//axis = cross(e0, f0);
testResult = tester(cross(e0, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//7
//axis = cross(e0, f0);
testResult = tester(cross(e0, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//8
//axis = cross(e1, f0);
testResult = tester(cross(e1, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//9
//axis = cross(e1, f1);
testResult = tester(cross(e1, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//10
//axis = cross(e1, f2);
testResult = tester(cross(e1, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//11
//axis = cross(e2, f0);
testResult = tester(cross(e2, f0), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//12
//axis = cross(e2, f1);
testResult = tester(cross(e2, f1), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
//13
//axis = cross(e2, f2);
testResult = tester(cross(e2, f2), firstVertexOfVoxel, v0, v1, v2);
if (testResult <= 0){
overlapCount++;
}
if (overlapCount == 13){
density[voxelIndexGlobalGrid] = 1;
}
}
l = l + voxelSize;
}// while for l
k = k + voxelSize;
}// while for k
j = j + voxelSize;
}//while for j
//printf("Here are the max of the %d-th face: %f, %f, %f\n", i / 9, maxLocalGrid.x, maxLocalGrid.y, maxLocalGrid.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e1.x, e1.y, e1.z);
//printf("Here are the coordinates of the %d-th face: %f, %f, %f\n", i / 9, e2.x, e2.y, e2.z);
//printf("\n==================KERNEL COMPUTED==================\n");
//barrier(CLK_LOCAL_MEM_FENCE);
}
}
}
这是c代码:
#define DEVICE_SELECTOR 1 //0 for Intel and 1 for Nvidia in my computer
#define _CRT_SECURE_NO_WARNINGS
#define KERNEL_FILE "..\\voxelizerKernel.cl"
#define WORK_DIM 1
#define VOXEL_SIZE 0.5f
#define HALF_VOXEL_SIZE VOXEL_SIZE/2.0f;
//C header files
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <algorithm>
//OpenCL header files
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
cl_device_id create_device() {
cl_platform_id *platform;
cl_device_id dev;
cl_uint num_platform;
int err;
/* Identify a platform */
err = clGetPlatformIDs(0, NULL, &num_platform);
if (err < 0) {
printf("Error code: %d. Couldn't identify a platform\n", err);
exit(1);
}
platform = (cl_platform_id*)malloc(sizeof(cl_platform_id)*num_platform);
clGetPlatformIDs(num_platform, platform, NULL);
/* Access a device */
err = clGetDeviceIDs(platform[DEVICE_SELECTOR], CL_DEVICE_TYPE_GPU, 1, &dev, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't access any devices\n", err);
exit(1);
}
return dev;
}
cl_program build_program(cl_context ctx, cl_device_id dev, const char* filename) {
cl_program program;
FILE *program_handle;
char *program_buffer, *program_log;
size_t program_size, log_size;
int err;
/* Read program file and place content into buffer */
program_handle = fopen(filename, "r");
if (program_handle == NULL) {
printf("Couldn't find the program file\n");
exit(1);
}
fseek(program_handle, 0, SEEK_END);
program_size = ftell(program_handle);
rewind(program_handle);
program_buffer = (char*)malloc(program_size + 1);
program_buffer[program_size] = '\0';
fread(program_buffer, sizeof(char), program_size, program_handle);
fclose(program_handle);
/* Create program from file */
program = clCreateProgramWithSource(ctx, 1,
(const char**)&program_buffer, &program_size, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create the program\n", err);
exit(1);
}
free(program_buffer);
/* Build program */
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err < 0) {
/* Find size of log and print to std output */
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(program, dev, CL_PROGRAM_BUILD_LOG,
log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}
return program;
}
void print_device_info(cl_device_id dev){
cl_ulong glob_mem_size, local_mem_size;
cl_uint clock_freq, num_core, work_item_dim, time_res;
size_t local_size, work_item_size[3];
char dev_vendor[40], dev_name[400], driver_version[40], device_version[40];
clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(dev_vendor), &dev_vendor, NULL);
clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(dev_name), &dev_name, NULL);
clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(glob_mem_size), &glob_mem_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(local_mem_size), &local_mem_size, NULL);
clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(driver_version), &driver_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(device_version), &device_version, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_freq), &clock_freq, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_core), &num_core, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(work_item_size), &work_item_size, NULL);
clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(work_item_dim), &work_item_dim, NULL);
clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL);
printf("==========================================================\n");
printf("Device Sepc without consideration of kernels:\n");
printf("CL_DEVICE_VENDOR: %s\n", dev_vendor);
printf("CL_DEVICE_NAME: %s\n", dev_name);
printf("CL_DEVICE_GLOBAL_MEM_SIZE: %I64u GB\n", glob_mem_size / 1073741824);
printf("CL_DEVICE_LOCAL_MEM_SIZE: %I64u KB\n", local_mem_size / 1024);
printf("CL_DRIVER_VERSION: %s\n", driver_version);
printf("CL_DEVICE_VERSION: %s\n", device_version);
printf("CL_DEVICE_MAX_CLOCK_FREQUENCY: %I32u MHz\n", clock_freq);
printf("CL_DEVICE_MAX_COMPUTE_UNITS: %I32u\n", num_core);
printf("CL_DEVICE_MAX_WORK_GROUP_SIZE %u\n", local_size);
printf("CL_DEVICE_MAX_WORK_ITEM_SIZES: {%I32u, %I32u, %I32u}\n", work_item_size[0], work_item_size[1], work_item_size[2]);
printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS: %I32u\n", work_item_dim);
printf("CL_DEVICE_PROFILING_TIMER_RESOLUTION: %I32u ns\n", time_res);
printf("==========================================================\n");
}
int main()
{
/*OpenCL variables*/
cl_int i, j, err, num_groups;
size_t local_size, max_local_size, global_size, processed_global_size;
cl_context context;
cl_command_queue queue;
cl_program program;
cl_device_id device;
cl_kernel voxelization_kernel, reduction_kernel, reduction_complete_kernel;
cl_mem coords_buffer, density_buffer, dimGrid_buffer, h_minBoundsGrid_buffer, fullVxelsCount_buffer, group_sums_buffer;
void *density_mapped_memory;
cl_event prof_event;
cl_ulong time_start, time_end, total_time;
float h_voxelSize = VOXEL_SIZE;
float fullVxelsCount = 0;
/*Read mesh data*/
float coords[54] =
{ 0.300500,
1.300000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
0.600000,
0.000500,
0.300500,
1.300000,
0.000500,
0.500500,
1.900000,
0.000500,
1.200500,
1.600000,
0.000500,
0.300500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500,
0.100500,
0.700000,
0.000500,
0.100500,
0.700000,
0.000500,
1.600500,
0.600000,
0.000500,
0.000500,
0.200000,
0.000500,
0.000500,
0.200000,
0.000500,
1.600500,
0.600000,
0.000500,
1.600500,
0.100000,
0.000500,
1.200500,
1.600000,
0.000500,
1.600500,
1.300000,
0.000500,
1.600500,
0.600000,
0.000500 };
/*Get the voxel count*/
float boundsGrid[6] = {0,2,0,2,0,0.5};
int dimGrid[3] = {
(boundsGrid[1] - boundsGrid[0]) / VOXEL_SIZE,
(boundsGrid[3] - boundsGrid[2]) / VOXEL_SIZE,
(boundsGrid[5] - boundsGrid[4]) / VOXEL_SIZE
};
if (dimGrid[0] == 0) dimGrid[0] = 1;
if (dimGrid[1] == 0) dimGrid[1] = 1;
if (dimGrid[2] == 0) dimGrid[2] = 1;
float h_minBoundsGrid[3];
h_minBoundsGrid[0] = boundsGrid[0];
h_minBoundsGrid[1] = boundsGrid[2];
h_minBoundsGrid[2] = boundsGrid[4];
int voxelCounts = dimGrid[0] * dimGrid[1] * dimGrid[2];
/*Prepare kernel output : build an array for storing voxles' density info*/
int *density = (int*)malloc(sizeof(int)*voxelCounts);
for (int i = 0; i < voxelCounts; i++){
density[i] = 0;
}
/*OpenCL essentials*/
device = create_device();
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_local_size), &max_local_size, NULL);
//print_device_info(device);
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a context\n", err);
exit(1);
}
program = build_program(context, device, KERNEL_FILE);
queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a command queue\n", err);
exit(1);
};
voxelization_kernel = clCreateKernel(program, "voxelizer", &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a kernel\n", err);
exit(1);
};
int numberOfFaces = 6;
global_size = numberOfFaces;
local_size = max_local_size;
if (global_size % local_size != 0){
processed_global_size = (global_size / local_size + 1) * local_size;
//int padding = processed_global_size - global_size;
//int *working_data = (int*)malloc((voxelCounts + padding)*sizeof(int));
//memcpy(working_data, density, voxelCounts);
//memset(working_data + voxelCounts, 0.0, padding);
}
else{
processed_global_size = global_size;
}
/* Create host-device data exchange interface*/
dimGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, dimGrid, &err);
h_minBoundsGrid_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float)* 3, h_minBoundsGrid, &err);
coords_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(float) * 54, coords, &err);
density_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY |
CL_MEM_COPY_HOST_PTR, sizeof(int) * voxelCounts, density, &err);
if (err < 0) {
printf("Error code: %d. Couldn't create a buffer\n", err);
exit(1);
};
err = clSetKernelArg(voxelization_kernel, 0, sizeof(global_size), &global_size);
err |= clSetKernelArg(voxelization_kernel, 1, sizeof(h_voxelSize), &h_voxelSize);
err |= clSetKernelArg(voxelization_kernel, 2, sizeof(cl_mem), &h_minBoundsGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 3, sizeof(cl_mem), &dimGrid_buffer);
err |= clSetKernelArg(voxelization_kernel, 4, sizeof(cl_mem), &coords_buffer);
err |= clSetKernelArg(voxelization_kernel, 5, sizeof(cl_mem), &density_buffer);
if (err < 0) {
printf("Error code: %d. Couldn't create an argument for voxelization_kernel\n", err);
exit(1);
}
/* Do the voxelization magic */
err = clEnqueueNDRangeKernel(queue, voxelization_kernel, 1, NULL, &processed_global_size,
&local_size, 0, NULL, &prof_event);
if (err < 0) {
printf("Error code: %d. Couldn't enqueue the voxelization_kernel\n", err);
exit(1);
}
/* Read the results */
density_mapped_memory = clEnqueueMapBuffer(queue, density_buffer, CL_TRUE,
CL_MAP_READ, 0, sizeof(density), 0, NULL, NULL, &err);
if (err < 0) {
printf("Error code : %d. Couldn't map the buffer to host memory\n", err);
exit(1);
}
memcpy(density, density_mapped_memory, sizeof(density)* voxelCounts);
err = clEnqueueUnmapMemObject(queue, density_buffer, density_mapped_memory,
0, NULL, NULL);
if (err < 0) {
printf("Error code: %d. Couldn't unmap the density_buffer\n", err);
exit(1);
}
for (int i = 0; i < voxelCounts; i++){
printf("%d\n", density[i]);
}
/*Clean up*/
clReleaseKernel(voxelization_kernel);
clReleaseMemObject(dimGrid_buffer);
clReleaseMemObject(h_minBoundsGrid_buffer);
clReleaseMemObject(coords_buffer);
clReleaseMemObject(density_buffer);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return 0;
}