我觉得我不了解基本的并行编程概念。下面的内核是一个简单/人为的例子,可以重现我遇到的问题。它试图使用" points"中的所有值。计算一个值并将其分配给"块中的所有项目。"我想推动这些数组大小的限制。虽然我可以制作"块"数组退出大(> 1亿浮点数),我得到一个"无效的命令队列" "积分"填充了超过~10万个浮点数(在clEnqueueNDRangeKernel之后立即调用clFinish)。你们中的任何人能帮助我理解为什么吗?
__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)
{
int globalId = get_global_id(0);
int count = 0;
for (int i = 0; i < pointsCount; i++)
{
count++;
}
blocks[globalId] = count;
};
部分设备信息:
CL_DEVICE_LOCAL_MEM_SIZE = 49,152
CL_DEVICE_GLOBAL_MEM_SIZE = 2,147,483,648
CL_DEVICE_MAX_MEM_ALLOC_SIZE = 536,870,912
主机代码:
#include "stdafx.h"
#include "CL\opencl.h"
#include <iostream>
#include <fstream>
#include <string>
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#define NUM_POINTS 100000
#define NUM_BLOCKS 100000000
struct openClData
{
cl_device_id deviceId = NULL;
cl_uint numDevices;
cl_uint numPlatforms;
cl_int ret;
cl_platform_id *platforms = NULL;
cl_context context;
cl_command_queue commandQueue;
cl_program program;
cl_kernel kernel;
char* kernelCode;
cl_uint kernelCodeSize;
size_t globalItemSize;
size_t localItemSize = 1;
};
char* getKernelCode();
void printErrorLog(openClData oclData);
void printRet(openClData oclData, int line);
int countFileChars(const char *fileName);
int _tmain(int argc, _TCHAR* argv[])
{
openClData oclData;
oclData.globalItemSize = NUM_POINTS;
oclData.kernelCode = getKernelCode();
std::cout << oclData.kernelCode << std::endl;
oclData.kernelCodeSize = strlen(oclData.kernelCode);
int numPoints = NUM_POINTS;
int numBlocks = NUM_BLOCKS;
cl_long localMemSize = 0, globalMemSize = 0, maxAllocMemSize = 0;
float *blocks = new float[numBlocks]{0};
float *points = new float[numPoints]{0};
//prepare platform, device, context and command queue
oclData.ret = clGetPlatformIDs(0, NULL, &oclData.numPlatforms);
printRet(oclData, __LINE__);
oclData.platforms = (cl_platform_id *)malloc(oclData.numPlatforms * sizeof(cl_platform_id));
oclData.ret = clGetPlatformIDs(oclData.numPlatforms, oclData.platforms, NULL);
printRet(oclData, __LINE__);
oclData.ret = clGetDeviceIDs(oclData.platforms[0], CL_DEVICE_TYPE_GPU, 1, &oclData.deviceId, &oclData.numDevices);
printRet(oclData, __LINE__);
oclData.context = clCreateContext(NULL, 1, &oclData.deviceId, NULL, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.commandQueue = clCreateCommandQueue(oclData.context, oclData.deviceId, 0, &oclData.ret);
printRet(oclData, __LINE__);
//prepare cl_mem objects
cl_mem memObjBlocks = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numBlocks, NULL, &oclData.ret);
printRet(oclData, __LINE__);
cl_mem memObjPoints = clCreateBuffer(oclData.context, CL_MEM_READ_WRITE, sizeof(float) * numPoints, NULL, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueWriteBuffer(oclData.commandQueue, memObjPoints, CL_TRUE, 0, sizeof(float) * numPoints, points, 0, NULL, NULL);
printRet(oclData, __LINE__);
//prepare program
oclData.program = clCreateProgramWithSource(oclData.context, 1, (const char**)&oclData.kernelCode, (const size_t *)&oclData.kernelCodeSize, &oclData.ret);
printRet(oclData, __LINE__);
oclData.ret = clBuildProgram(oclData.program, 1, &oclData.deviceId, NULL, NULL, NULL);
printRet(oclData, __LINE__);
if (oclData.ret == CL_BUILD_PROGRAM_FAILURE) printErrorLog(oclData);
oclData.kernel = clCreateKernel(oclData.program, "openClTesting", &oclData.ret);
printRet(oclData, __LINE__);
//set arguments
oclData.ret = clSetKernelArg(oclData.kernel, 0, sizeof(cl_mem), &memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 1, sizeof(cl_mem), &memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clSetKernelArg(oclData.kernel, 2, sizeof(int), &numPoints);
printRet(oclData, __LINE__);
//run
oclData.ret = clEnqueueNDRangeKernel(oclData.commandQueue, oclData.kernel, 1, NULL, &oclData.globalItemSize, &oclData.localItemSize, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clEnqueueReadBuffer(oclData.commandQueue, memObjBlocks, CL_TRUE, 0, sizeof(float) * numBlocks, blocks, 0, NULL, NULL);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
//print some device info
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, 0);
std::cout << "CL_DEVICE_LOCAL_MEM_SIZE = " << localMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_long), &globalMemSize, 0);
std::cout << "CL_DEVICE_GLOBAL_MEM_SIZE = " << globalMemSize << '\n';
oclData.ret = clGetDeviceInfo(oclData.deviceId, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxAllocMemSize, 0);
std::cout << "CL_DEVICE_MAX_MEM_ALLOC_SIZE = " << maxAllocMemSize << '\n';
//clean up
oclData.ret = clFlush(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clFinish(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseKernel(oclData.kernel);
printRet(oclData, __LINE__);
oclData.ret = clReleaseProgram(oclData.program);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjBlocks);
printRet(oclData, __LINE__);
oclData.ret = clReleaseMemObject(memObjPoints);
printRet(oclData, __LINE__);
oclData.ret = clReleaseCommandQueue(oclData.commandQueue);
printRet(oclData, __LINE__);
oclData.ret = clReleaseContext(oclData.context);
printRet(oclData, __LINE__);
for (size_t i = 0; i < 10; i++)
{
std::cout << blocks[i] << std::endl;
}
delete blocks;
delete points;
return 0;
}
char* getKernelCode()
{
char* kernelCode =
"__kernel void openClTesting (__global float *blocks, __global float *points, int pointsCount)"
"{"
" int globalId = get_global_id(0);"
" int count = 0;"
" for (int i = 0; i < pointsCount; i++)"
" {"
" count++;"
" }"
"blocks[globalId] = count;"
"}";
return kernelCode;
}
void printErrorLog(openClData oclData)
{
size_t log_size;
clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *)malloc(log_size);
clGetProgramBuildInfo(oclData.program, oclData.deviceId, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
std::cout << log;
free(log);
}
void printRet(openClData oclData, int line)
{
std::cout << line << ", " << oclData.ret << std::endl;
}
int countFileChars(const char *fileName)
{
std::ifstream ifs(fileName);
ifs.seekg(0, std::ios_base::end);
size_t count = ifs.tellg();
ifs.seekg(0, std::ios_base::beg);
return count;
}
答案 0 :(得分:0)
我注意到的一些事情:
NUM_POINTS
个工作项,但会将每个项的结果写入blocks[globalId]
- 其中包含NUM_BLOCKS
项。因此,当NUM_POINTS大于NUM_BLOCKS时,这是未定义的行为。它还解释了为什么不同的NUM_BLOCKS什么都不做(超出上述限制):除了内存分配外,NUM_BLOCKS的值无效。 (并且您发现的内存分配限制大致与您的实现的CL_DEVICE_MAX_MEM_ALLOC_SIZE值相匹配。)答案 1 :(得分:0)
总的来说,应该避免使用localItemSize = 1;
,因为它会强制每个OpenCL工作组由一个工作项组成,这会降低与计算设备可以并行运行的工作组数量的并行度,这将远远少于它可以运行的工作项数。您可以简单地将NULL
传递给本地项目大小,而不是让OpenCL实现在其自身中找出合理的值:
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, NULL, 0, NULL, NULL);
这也可能是您的错误来源,因为您正在创建NUM_POINTS
工作组,但设备上队列的大小受内存限制(CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE
)。