我尝试运行以下代码(相比我最初的版本已经做了精简,但仍然会出现同样的错误):
#include <OpenCL/opencl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "gpu_comp.h"
#include "randNorm.h"
/* OpenCL program object: assigned by setup(), used by main() to create the "prods" kernel. */
cl_program compute;
void setup(void) {
cl_int error;
char *src_full =
"__kernel void prods(const size_t d,\n"
" const size_t n,\n"
" __global const double *v,\n"
" __global const double *p,\n"
" __global double *o) {\n"
" size_t x = get_global_id(0), y = get_global_id(1), z = get_global_id(3);\
\n"
" o[(x * n + y) * d + z] = v[x * d + z] * p[y * d + z];\n"
"}\n";
size_t len = strlen(src_full);
compute = clCreateProgramWithSource(gpu_context, 1, (const char **)&src_full,
&len, &error);
if(error != CL_SUCCESS) {
fprintf(stderr, "Error loading OpenCL code.\n");
exit(1);
}
if(clBuildProgram(compute, 0, NULL, "", NULL, NULL) != CL_SUCCESS) {
fprintf(stderr, "Error building program.\n");
exit(1);
}
}
/*
 * Fill the first n * d entries of `points` with successive return values of
 * rand_norm(), in increasing index order.
 */
void genRand(size_t n, size_t d, double *points) {
    double *cursor = points;

    for (size_t remaining = n * d; remaining > 0; remaining--) {
        *cursor++ = rand_norm();
    }
}
int main(void) {
size_t m = 10, n = 70, d = 80;
double *vh = malloc(sizeof(double) * m * d);
double *ph = malloc(sizeof(double) * n * d);
double *oh = malloc(sizeof(double) * m * n * d);
for(size_t x = 0; x < m; x++)
for(size_t y = 0; y < n; y++)
for(size_t z = 0; z < n; z++)
oh[(x * n + y) * d + z] = vh[x * d + z] * ph[y * d + z];
fprintf(stderr, "That worked.\n");
gpu_init();
setup();
cl_command_queue q = clCreateCommandQueue(gpu_context, the_gpu, 0, NULL);
genRand(m, d, vh);
genRand(m, d, ph);
cl_mem v = clCreateBuffer(gpu_context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR |
CL_MEM_HOST_NO_ACCESS, sizeof(double) * m * d,
(void *)vh, NULL);
cl_mem p = clCreateBuffer(gpu_context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR |
CL_MEM_HOST_NO_ACCESS, sizeof(double) * n * d,
(void *)ph, NULL);
cl_mem o = clCreateBuffer(gpu_context,
CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS,
sizeof(double) * m * n * d, NULL, NULL);
cl_kernel prods = clCreateKernel(compute, "prods", NULL);
clSetKernelArg(prods, 0, sizeof(d), &d);
clSetKernelArg(prods, 1, sizeof(n), &n);
clSetKernelArg(prods, 2, sizeof(v), &v);
clSetKernelArg(prods, 3, sizeof(p), &p);
clSetKernelArg(prods, 4, sizeof(o), &o);
size_t foo[3] = {m, n, d};
clEnqueueNDRangeKernel(q, prods, 3, NULL, foo, NULL, 0, NULL, NULL);
fprintf(stderr, "Fine to here.\n");
clFlush(q);
fprintf(stderr, "And here.\n");
clFinish(q);
fprintf(stderr, "But segfaults before here.\n");
}
不知为何,它会依次打印出:`That worked.`、`Fine to here.`、`And here.`,然后是 `Segmentation fault: 11`。
问题是:为什么会出现段错误?我完全无法理解:同样的计算由 CPU 以普通 C(而不是 OpenCL)执行时可以正常工作;在 GPU 上以 OpenCL 运行时也能正常工作(需要先把 double 换成 unsigned long,并修改查找 GPU 的代码以确保它确实在 GPU 上运行);但在 CPU 上以 OpenCL 运行时却会发生段错误。WTF?
编辑:
弄清楚了,问题在于我调用了 get_global_id(3),这是错误的:三维 NDRange 的有效维度索引只有 0、1、2,这里应该写 get_global_id(2)。