在我的代码中我创建了一个主变量
h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));
coountlog是一个变量,它基本上表示二维数组的行大小(我将其作为一维数组实现)
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
这是我如何在主程序中调用CUDA
//free cuda memory from previous call
cudaFree(d3_in);
cudaFree(d3_out);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
}
//send data to host in for stat2 calculations
for (int count = 0; count < countlog; count++)
{
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
//Query device to get parameters
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up according to array size
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
//allocate memory on gpu
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
}
//transfer array to gpu
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
}
//launch threads
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaDeviceSynchronize();
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
}
//transfer data back to host
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
}
//free cuda
cudaFree(d4_in);
cudaFree(d4_out);
内核调用如下
__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
int idx = blockDim.x*blockIdx.x + threadIdx.x;
double X, A, B, C, D, BX, BA, BB, BC;
if (idx < 4)
{
d4_out[idx * 23 + 0] = -1;
d4_out[idx * 23 + 1] = -1;
d4_out[idx * 23 + 2] = -1;
d4_out[idx * 23 + 3] = -1;
d4_out[idx * 23 + 4] = -1;
d4_out[idx * 23 + 5] = -1;
d4_out[idx * 23 + 6] = -1;
d4_out[idx * 23 + 7] = -1;
d4_out[idx * 23 + 8] = -1;
d4_out[idx * 23 + 9] = -1;
d4_out[idx * 23 + 10] = -1;
d4_out[idx * 23 + 11] = -1;
d4_out[idx * 23 + 12] = -1;
d4_out[idx * 23 + 13] = -1;
d4_out[idx * 23 + 14] = -1;
d4_out[idx * 23 + 15] = -1;
d4_out[idx * 23 + 16] = -1;
d4_out[idx * 23 + 17] = -1;
d4_out[idx * 23 + 18] = -1;
d4_out[idx * 23 + 19] = -1;
d4_out[idx * 23 + 20] = -1;
d4_out[idx * 23 + 21] = -1;
d4_out[idx * 23 + 22] = -1;
}
else
{
X = d4_in[idx * 2 - 8];
A = d4_in[idx * 2 - 6];
B = d4_in[idx * 2 - 4];
C = d4_in[idx * 2 - 2];
D = d4_in[idx * 2 - 0];
BX = d4_in[idx * 2 - 5];
BA = d4_in[idx * 2 - 3];
BB = d4_in[idx * 2 - 1];
BC = d4_in[idx * 2 + 1];
//start the stats calcs here
d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
d4_out[idx * 23 + 13] = BC;
d4_out[idx * 23 + 14] = BB;
d4_out[idx * 23 + 15] = BA;
d4_out[idx * 23 + 16] = BX;
d4_out[idx * 23 + 17] = BB + BC;
d4_out[idx * 23 + 18] = BA + BB + BC;
d4_out[idx * 23 + 19] = BX + BA + BB + BC;
d4_out[idx * 23 + 20] = BA + BB;
d4_out[idx * 23 + 21] = BX + BA + BB;
d4_out[idx * 23 + 22] = BX + BA;
}
}
我在cudamemcppy设备中遇到错误,主机和cudadevicesynchronise遇到非法内存访问。在堆栈溢出帮助后,我更正了我的代码,使其成为一维数组,我已经为主机和设备阵列分配了相同的内存。奇怪的是
此程序在较小的文件上成功运行(输入是OHLC数据)但在较大的文件上出现此错误
即使对于较大的文件,也有3个其他内核调用成功运行,没有任何问题。
非常感谢任何帮助。
先谢谢
阿布舍克巴克
PS我使用的是单个GTX 760卡(华硕供应商:https://www.asus.com/Graphics-Cards/GTX760DC2OC2GD5/),内存为2GB。 cuda版本也是7. IDE是VS 2013。
答案 0 :(得分:1)
您(可能)启动了比实际需要更多的线程:
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
你的内核中没有线程检查这个条件。编号高于countlog
的线程将访问您的阵列越界。
尝试将内核中的else语句更改为:
else if (idx < size)