当我直接运行我的代码时,它可以完美执行;但是当我尝试在 Visual Profiler 中运行它时,第一次运行正常,而分析器似乎要把程序运行七次,到第二次运行时就出现了 "unspecified launch failure"(未指定的启动失败)错误。为什么会这样?我的代码如下所示,我的错误检查告诉我错误发生在 cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost); 这一行(在代码中搜索 memcpy11 最容易找到,出错的就是它上面的那一行)。
我想不出程序为什么会在第二次运行时出错、而第一次却不会;如果我在终端里连续运行它多次,则完全没有问题。有人能想到可能是什么原因吗?谢谢!
/*
 * fillin - host-side wrapper around the fillinBoth kernel.
 *
 * Copies the per-site step counts (numsteps) and site sets (siteset) of the
 * two nodes `left` and `rt` to the device, runs fillinBoth over all `chars`
 * sites, and copies the kernel's output into `p`.  The kernel writes its
 * results in place into the buffers that were filled from `rt`, which is why
 * rsteps/rsites are the buffers copied back.
 *
 * Parameters:
 *   p    - node that receives the combined numsteps / siteset arrays.
 *   left - first input node  (its arrays are only read).
 *   rt   - second input node (its arrays are only read).
 *
 * All three pointers are dereferenced unconditionally, so they must be
 * non-NULL.  Errors are reported through checkCUDAError (defined elsewhere;
 * presumably it inspects the last CUDA runtime error - confirm).
 */
void fillin(node *p, node *left, node *rt)
{
  /* Nothing to combine; this also avoids launching a grid with zero blocks,
     which the runtime rejects as an invalid configuration. */
  if (chars <= 0)
    return;

  size_t stepsize = chars * sizeof(long);
  size_t sitesize = chars * sizeof(sitearray);

  /* Device copies of the inputs.  lsteps had no visible declaration in this
     function; it is declared locally here to mirror rsteps.
     NOTE(review): if a file-level lsteps also exists, this local shadows it,
     which is harmless because the buffer is freed before returning. */
  steptr lsteps;
  seqptr lsites;
  steptr rsteps;
  seqptr rsites;

  cudaMalloc((void **) &lsteps, stepsize);
  checkCUDAError("malloc");
  cudaMalloc((void **) &lsites, sitesize);
  checkCUDAError("malloc");
  cudaMemcpy(lsteps, left->numsteps, stepsize, cudaMemcpyHostToDevice);
  checkCUDAError("memcpy7");
  cudaMemcpy(lsites, left->siteset, sitesize, cudaMemcpyHostToDevice);
  checkCUDAError("memcpy8");

  cudaMalloc((void **) &rsteps, stepsize);
  checkCUDAError("malloc");
  cudaMalloc((void **) &rsites, sitesize);
  checkCUDAError("malloc");
  cudaMemcpy(rsteps, rt->numsteps, stepsize, cudaMemcpyHostToDevice);
  checkCUDAError("memcpy9");
  cudaMemcpy(rsites, rt->siteset, sitesize, cudaMemcpyHostToDevice);
  checkCUDAError("memcpy");

  /* Launch one single-thread block per site.  Do not change this launch
     shape without checking how fillinBoth derives its site index. */
  int block_size = 1;
  int n_blocks = (int)chars;
  fillinBoth <<<n_blocks, block_size>>> (lsteps, lsites, rsteps, rsites, chars);
  /* Kernel launches return no status themselves: check for a bad launch
     configuration here, then wait for the kernel so that a fault inside it
     is reported as a kernel error instead of surfacing at the next memcpy. */
  checkCUDAError("fillinBoth launch");
  cudaDeviceSynchronize();
  checkCUDAError("fillinBoth execution");

  /* The kernel stored its output in the rt-side buffers. */
  cudaMemcpy(p->numsteps, rsteps, stepsize, cudaMemcpyDeviceToHost);
  checkCUDAError("memcpy10");
  cudaMemcpy(p->siteset, rsites, sitesize, cudaMemcpyDeviceToHost);
  checkCUDAError("memcpy11");

  cudaFree(rsites);
  cudaFree(rsteps);
  cudaFree(lsites);
  cudaFree(lsteps);
  checkCUDAError("free");
}
}
/*
 * fillinBoth - combine the left and right site sets for one site per thread.
 *
 * Launch layout: 1-D grid of 1-D blocks.  Each thread handles the site whose
 * index is its flat global thread index; threads whose index is >= max do
 * nothing, so any grid that covers `max` threads is valid (including the
 * one-thread-per-block launch <<<max, 1>>>).  No shared memory is used.
 *
 * Parameters:
 *   lsteps, lsite - step counts and site sets of the left input (read only).
 *   rsteps, rsite - step counts and site sets of the right input; they are
 *                   overwritten in place with the combined result.
 *   max           - number of sites; valid indices are [0, max).
 *
 * Also reads the device symbols cudaWeight and cudaTranslate, which are
 * defined elsewhere in the project.
 */
__global__ void fillinBoth (steptr lsteps, seqptr lsite, steptr rsteps, seqptr rsite, long max){
  boolean counted = false;
  aas aa;
  long s = 0;
  long i, j, n;
  long k = 0;

  /* Per-thread scratch copies.  These were __shared__ before, which is only
     safe with exactly one thread per block: every thread of a block would
     otherwise overwrite the same three slots with data for different sites. */
  long ls[3];
  long rs[3];
  /* Zeroed so that entries the first loop does not fill are well defined
     when the second loop ORs into them and when they are written back. */
  long qs[3] = {0, 0, 0};

  /* Flat global index; equals blockIdx.x when blockDim.x == 1. */
  long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= max)
    return;

  for (i = 0; i < 3; i++) {
    rs[i] = rsite[idx][i];
    ls[i] = lsite[idx][i];
  }
  n = lsteps[idx] + rsteps[idx];

  /* Walk the six candidate combinations in order.  Every candidate that is
     empty before the first non-empty one adds this site's weight to n; from
     the first non-empty candidate on, up to three consecutive candidates are
     stored in qs. */
  for (i = 0; i <= 5; i++) {
    if (k >= 3)
      continue;
    switch (i) {
    case 0:
      s = ls[0] & rs[0];
      break;
    case 1:
      s = (ls[0] & rs[1]) | (ls[1] & rs[0]);
      break;
    case 2:
      s = (ls[0] & rs[2]) | (ls[1] & rs[1]) | (ls[2] & rs[0]);
      break;
    case 3:
      s = ls[0] | (ls[1] & rs[2]) | (ls[2] & rs[1]) | rs[0];
      break;
    case 4:
      s = ls[1] | (ls[2] & rs[2]) | rs[1];
      break;
    case 5:
      s = ls[2] | rs[2];
      break;
    }
    if (counted || s != 0) {
      qs[k] = s;
      k++;
      counted = true;
    } else {
      /* counted is necessarily false on this path */
      n += cudaWeight[idx];
    }
  }

  /* For every member aa present in qs[i], OR the matching cudaTranslate row
     entries into the later qs slots. */
  for (i = 0; i <= 1; i++) {
    for (aa = ala; (long)aa <= (long)stop; aa = (aas)((long)aa + 1)) {
      if (((1L << ((long)aa)) & qs[i]) != 0) {
        for (j = i + 1; j <= 2; j++)
          qs[j] |= cudaTranslate[(long)aa - (long)ala][j - i];
      }
    }
  }

  /* Results are stored in place in the right-hand buffers. */
  rsteps[idx] = n;
  for (i = 0; i < 3; i++)
    rsite[idx][i] = qs[i];
}
答案 0 :(得分:1)
尝试在分析会话(profile session)设置中禁用所有计数器。另外,尝试从工作文件夹中删除所有类似 "temp_compute_profiler_1_1.csv" 的文件(参见分析设置中的"工作文件夹",默认情况下它与可执行文件所在的位置相同)。
这里报告过同样的错误(基于 CUDA 的 OpenCL):http://www.khronos.org/message_boards/viewtopic.php?t=4324