在我的代码中我创建了一个主变量
// Host-side buffers, zero-initialized; one "row" per log entry:
//   h4_in : 2 doubles per row  (price, stat1 value), interleaved
//   h4_out: 23 doubles per row (kernel results)
// NOTE(review): calloc return values are not checked — a failed allocation
// would pass NULL to cudaMemcpy later; verify against the full program.
h4_in = (double*)calloc(2 * countlog, sizeof(double));
h4_out = (double*)calloc(23 * countlog, sizeof(double));
countlog是一个变量,它基本上表示二维数组的行数(我将该数组实现为一维数组)
//send data to host in for stat2 calculations
// Pack the interleaved input: even slot = price, odd slot = first column of
// the stat1 results (stride 6). This same loop is repeated again just before
// the kernel launch below, so one of the two copies is redundant.
for (int count = 0; count < countlog; count++)
{
h4_in[count * 2 + 0] = prc[count];
h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
这是我如何在主程序中调用CUDA
//free cuda memory from previous call
// Free the device buffers left over from the previous kernel stage.
cudaFree(d3_in);
cudaFree(d3_out);
// NOTE(review): cudaDeviceReset() tears down the entire CUDA context (all
// device allocations, streams, events) and forces an expensive lazy
// re-initialization on the next CUDA call. It is normally called once at
// program exit, not between kernel stages — confirm it is really wanted here.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "cudaDeviceReset Failed :%s\n", cudaGetErrorString(cudaStatus));
}
// Pack host input for stat2: row i = (price, stat1 column 0), interleaved.
for (int count = 0; count < countlog; count++)
{
    h4_in[count * 2 + 0] = prc[count];
    h4_in[count * 2 + 1] = h_stat1out[count * 6];
}
// Let the runtime suggest an occupancy-friendly block size for calcstats2.
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &threadsPerBlock, calcstats2, 0, countlog);
// Round up so blocksPerGrid * threadsPerBlock >= countlog. This launches
// extra tail threads, so the kernel MUST bounds-check idx against countlog.
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
// Allocate device buffers: 2 doubles in, 23 doubles out, per row.
cudaStatus = cudaMalloc((void **)&d4_in, 2 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_in :%s\n", cudaGetErrorString(cudaStatus));
}
cudaStatus = cudaMalloc((void **)&d4_out, 23 * countlog * sizeof(double));
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "CudaMalloc failed in kernel calcstats2 for d_out :%s\n", cudaGetErrorString(cudaStatus));
}
// Copy the packed input to the device.
cudaStatus = cudaMemcpy(d4_in, h4_in, 2 * countlog * sizeof(double), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 host2device :%s\n", cudaGetErrorString(cudaStatus));
}
// Launch the kernel; cudaGetLastError() catches launch-configuration errors.
calcstats2 <<<blocksPerGrid, threadsPerBlock>>>(d4_out, d4_in, countlog);
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "calcstats2 kernel failed: %s\n", cudaGetErrorString(cudaStatus));
}
// Wait for the kernel to finish; asynchronous execution errors (e.g. an
// illegal memory access inside the kernel) surface here. FIX: check the
// status returned by cudaDeviceSynchronize() directly — previously it was
// overwritten by a second cudaGetLastError() call before being checked.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Device Sync failed: %s\n", cudaGetErrorString(cudaStatus));
}
// Copy the 23-per-row results back to the host (blocking copy, so the data
// is valid as soon as this returns successfully).
cudaStatus = cudaMemcpy(h4_out, d4_out, 23 * countlog * sizeof(double), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
    fprintf(stderr, "Cudamemcopy failed in kernel calcstats2 device2host :%s\n", cudaGetErrorString(cudaStatus));
}
// Release this stage's device buffers.
cudaFree(d4_in);
cudaFree(d4_out);
内核调用如下
// Kernel: compute 23 per-row statistics from interleaved (price, stat1) input.
//   d4_in  layout: [i*2 + 0] = price of row i, [i*2 + 1] = stat1 of row i  (2*size doubles)
//   d4_out layout: 23 doubles per row                                       (23*size doubles)
// Rows 0..3 lack the 4 rows of history the formulas need, so their outputs
// are filled with the sentinel -1.
// Launch with any 1-D grid covering at least `size` threads; because the host
// rounds the grid up to a multiple of the block size, excess tail threads must
// exit immediately (see bounds guard below).
__global__ void calcstats2(double *d4_out, double *d4_in, int size)
{
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // FIX: bounds guard. Without it, tail threads (idx >= size) read d4_in and
    // write d4_out out of bounds — the cause of the illegal-memory-access
    // errors reported on larger inputs.
    if (idx >= size)
        return;

    if (idx < 4)
    {
        // Not enough history: mark all 23 outputs of this row invalid.
        for (int k = 0; k < 23; k++)
            d4_out[idx * 23 + k] = -1;
    }
    else
    {
        // Prices of the current row (D) and the four preceding rows.
        double X = d4_in[idx * 2 - 8]; // price, 4 rows back
        double A = d4_in[idx * 2 - 6];
        double B = d4_in[idx * 2 - 4];
        double C = d4_in[idx * 2 - 2];
        double D = d4_in[idx * 2 - 0]; // current price
        // stat1 values of the corresponding rows. Highest index touched is
        // idx*2 + 1 <= 2*size - 1, so this stays within the 2*size buffer.
        double BX = d4_in[idx * 2 - 5];
        double BA = d4_in[idx * 2 - 3];
        double BB = d4_in[idx * 2 - 1];
        double BC = d4_in[idx * 2 + 1]; // current row's stat1
        // Ratio statistics. NOTE(review): a zero denominator (equal prices)
        // yields inf/nan here — confirm the consumer tolerates that.
        d4_out[idx * 23 + 0] = fabs(X - D) / fabs(A - X);
        d4_out[idx * 23 + 1] = fabs(A - D) / fabs(A - X);
        d4_out[idx * 23 + 2] = fabs(B - D) / fabs(C - B);
        d4_out[idx * 23 + 3] = fabs(C - D) / fabs(C - B);
        d4_out[idx * 23 + 4] = fabs(B - D) / fabs(A - B);
        d4_out[idx * 23 + 5] = fabs(A - D) / fabs(A - B);
        d4_out[idx * 23 + 6] = fabs(X - C) / fabs(A - X);
        d4_out[idx * 23 + 7] = fabs(A - C) / fabs(A - X);
        d4_out[idx * 23 + 8] = fabs(C - B) / fabs(A - B);
        d4_out[idx * 23 + 9] = fabs(A - B) / fabs(A - X);
        d4_out[idx * 23 + 10] = fabs(C - D) / fabs(A - B);
        d4_out[idx * 23 + 11] = fabs(C - D) / fabs(A - X);
        d4_out[idx * 23 + 12] = fabs(C - B) / fabs(A - X);
        // Raw stat1 values and their running partial sums.
        d4_out[idx * 23 + 13] = BC;
        d4_out[idx * 23 + 14] = BB;
        d4_out[idx * 23 + 15] = BA;
        d4_out[idx * 23 + 16] = BX;
        d4_out[idx * 23 + 17] = BB + BC;
        d4_out[idx * 23 + 18] = BA + BB + BC;
        d4_out[idx * 23 + 19] = BX + BA + BB + BC;
        d4_out[idx * 23 + 20] = BA + BB;
        d4_out[idx * 23 + 21] = BX + BA + BB;
        d4_out[idx * 23 + 22] = BX + BA;
    }
}
我在 cudaMemcpy(device to host)和 cudaDeviceSynchronize 处遇到"非法内存访问"(illegal memory access)错误。在 Stack Overflow 的帮助下,我已将代码改为一维数组,并为主机和设备数组分配了相同大小的内存。奇怪的是
此程序在较小的文件上成功运行(输入是OHLC数据)但在较大的文件上出现此错误
即使对于较大的文件,也有3个其他内核调用成功运行,没有任何问题。
非常感谢任何帮助。
先谢谢
阿布舍克巴克
PS我使用的是单个GTX 760卡(华硕供应商:https://www.asus.com/Graphics-Cards/GTX760DC2OC2GD5/),内存为2GB。 cuda版本也是7. IDE是VS 2013。
答案 0 :(得分:1)
您(可能)启动了比实际需要更多的线程:
blocksPerGrid = (countlog + threadsPerBlock - 1) / threadsPerBlock;
你的内核中没有对这个条件做线程边界检查。编号达到或超过 countlog
的线程将会越界访问你的数组。
尝试将内核中的else语句更改为:
else if (idx < size)