我目前正在用 OpenCL 实现二维数组(矩阵)运算。在大多数情况下,当 matrix_size 不超过 15 时,一切正常;但当我把它增大到 100 左右时,程序就会崩溃。根据 Visual Studio 调试器,问题似乎是整数除以 0,但我不太确定它发生在哪里。我的猜测是:这是工作项(work-item)和工作组(work-group)的设置问题:
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);
不幸的是,我不太确定如何解决这个问题。最终,我希望能够在相对较大的数据集上运行这些基本计算。
int main() {
` srand((unsigned int)time(NULL));
const int matrix_size = 100;
string input;
string func;
string input_file;
cout << "Please enter a arithmetic option: multi or add" << endl;
cout << ">> ";
input_file = "MatrixArithmetic.cl";
getline(cin, input);
if (input[0] == 'a') {func = "matrix_add";}
else if (input[0] == 'm') {func = "matrix_multi";}
else { cout << "Not a valid option... exiting" << endl; return 0; }
ifstream ArithmeticFile(input_file);
string src(istreambuf_iterator<char>(ArithmeticFile), (istreambuf_iterator<char>()));
//prepare platform
vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
auto platform = platforms.front();
//gather device info from platform and store into devices vector
vector<cl::Device> devices;
platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
//chose device for computation
auto device = devices.front();
cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>() << endl;
/*cout << "This: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;
cout << "This 2: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;*/
//setup the context
cl::Program::Sources sources;
sources.push_back({ src.c_str(), src.length() });
cl::Context context(device);
cl::Program program(context, sources);
auto err = program.build("-cl-std=CL1.2");
//setup kernel (this is kernel specific)
cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
//build and seed matrix using random for computation this is done on the main processor
float vec1[matrix_size][matrix_size];
float vec2[matrix_size][matrix_size];
for (int x = 0; x < matrix_size; x++) {
for (int y = 0; y < matrix_size; y++) {
vec1[x][y] = (float)(1+rand()%(rand()%1000));
vec2[x][y] = (float)(1+rand()%(rand()%1000));
}
}
//queue setup for pushing commands to device
cl::CommandQueue queue(context, device);
//write vec1 and vec2 to device
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec1);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec2);
////run the kernel
cl::Kernel kernel = cl::Kernel(program, func.c_str());
//pushing argument to kernel it has 3 total arguments
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);
queue.finish();
//writing to the buffer
float vec3[matrix_size][matrix_size];
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec3);
cin.get();
return 0;
}
Kernel:
// Element-wise addition: each work-item handles the single element selected
// by its global id in dimension 0 and stores A[i] + B[i] into C[i].
__kernel void matrix_add(__global const float *A, __global float *B,
                         __global float *C)
{
    // Position of this work-item in the flattened matrix.
    const int gid = get_global_id(0);

    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs + rhs;
}
// Element-wise multiplication: each work-item handles the single element
// selected by its global id in dimension 0 and stores A[i] * B[i] into C[i].
// The matrices arrive as flat 1-D buffers.
__kernel void matrix_multi(__global const float *A, __global const float *B, __global float *C)
{
    // Position of this work-item in the flattened matrix.
    const int gid = get_global_id(0);

    const float lhs = A[gid];
    const float rhs = B[gid];
    C[gid] = lhs * rhs;
}
规格:i5 4690K 和 AMD R9 290,8GB 内存。所以内存不应该成为问题:每个 "Matrix" 在大小为 100x100 时应该只占用大约 4000 字节。
答案 0 :(得分:0)
罪魁祸首:rand()%1000 的结果有时为 0,而它在下面两行中被用作取模运算的除数,于是触发整数除以 0:
vec1[x][y] = (float)(1+rand()%(rand()%1000));
vec2[x][y] = (float)(1+rand()%(rand()%1000));
更改为:
vec1[x][y] = (float)(1+rand()%(1+rand()%1000));
vec2[x][y] = (float)(1+rand()%(1+rand()%1000));
答案 1 :(得分:0)
您应该使用包装器将1-D数组(或分配)视为2-D数组:
MDArr buf = new MDArr(100,100); // allocates 10k memory and gets its pointer
buf[10][90]=3; // this is overloaded operator usage for host-side
clEnqueueWriteBuffer(.,..,buf.ptr()) // 0th address of contiguous mem
// same as &vec1[0][0]
这样就不会访问到不连续的、不允许访问的内存区域。
编辑:在 x86 CPU 上,100x100 的 float 数组占用 4 * 100 * 100 = 40000 字节(约 40 KB)的内存。