CUDA not copying large arrays to the device

Date: 2014-08-25 06:25:47

Tags: arrays cuda nvcc

I'm new to CUDA, so I apologize if I'm making some silly mistake, but this puzzles me. The code below works for arrays of up to 620 elements. As soon as the NV define (the number of vortices) is changed to 621 or more, every array in the kernel becomes NaN. I hope someone can explain this.

#include <stdio.h>
#include <time.h>
#define NP 20000
#define DT 0.01 
#define NV 620  // Fails if 621 or larger
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__device__ float d_x0[NV];
__device__ float d_y0[NV];
__global__ static void  calc(float *d_x, float *d_y, float Lx, float Ly ){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float fx, fy, t0, t1, t2, t3, t4, dx, dy, pi = acos(-1.0);
    int j, n;
    if (i<NV) {
        // For array error detection
        if (isnan(d_x0[i])) printf(" dx(%d)!",i);
        if (isnan(d_y0[i])) printf(" dy(%d)!",i);
        if (isnan(d_x[i])) printf(" x(%d)!",i);
        if (isnan(d_y[i])) printf(" y(%d)!",i);
        fx = 0.0;   fy = 0.0;
        for (j = 0 ; j < NV ; j++){ 
            dx = d_x0[i] - d_x0[j];
            dy = d_y0[i] - d_y0[j];
            t0 = 2.0 * dy / Ly;
            t1 = sin(2.0 * pi * dx / Lx);
            t3 = cos(2.0 * pi * dx / Lx);
                for (n = -10 ; n <= 10 ; n++){
                    if (n == 0){
                        if (j != i){
                            t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
                            t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
                            fx = fx + t1 / (t2 - t3);
                            fy = fy + t4 / (t2 - t3);
                        }
                    }   
                    else{
                        t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
                        t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
                        fx = fx + t1 / (t2 - t3);
                        fy = fy + t4 / (t2 - t3);                           
                    }
                }
                fy = fy - t0;
        }
        fx = fx * pi / Lx;
        fy = fy * pi / Lx;
        d_x[i] = d_x0[i] + fx * DT;
        d_y[i] = d_y0[i] + fy * DT;
        // Clip box
        if(d_x[i] > Lx)   d_x[i] = d_x[i] - (abs(d_x[i] / Lx) * Lx);
        if(d_x[i] < 0.0)  d_x[i] = d_x[i] + ((abs(d_x[i] / Lx) + 1.0) * Lx);
        if(d_y[i] > Ly)   d_y[i] = d_y[i] - (abs(d_y[i] / Ly) * Ly);
        if(d_y[i] < 0.0)  d_y[i] = d_y[i] + ((abs(d_y[i] / Ly) + 1.0) * Ly);
    }
}
__global__ static void  update(float *d_x, float *d_y ){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i<NV) {
        d_x0[i] = d_x[i];
        d_y0[i] = d_y[i];
    }
}
int main(int argc,char **argv) {
    float Lx, Ly, dv;
    int i, k;
    int size = (NV) * sizeof(float);
    float* x = (float*)malloc(size);
    float* y = (float*)malloc(size);
    float* x0 = (float*)malloc(size);
    float* y0 = (float*)malloc(size);
    dv = 0.12 * 16.0;
    Lx = sqrt(2.0 / 3.0 * sqrt(3.0) * NV / dv); 
    Ly = Lx * sqrt(3.0) / 2.0;
    for(i=0 ; i < NV ; i++){
        x0[i] = Lx * (rand() % 1000)/1000;  
        y0[i] = Ly * (rand() % 1000)/1000;
    }
    // GPU mem management
    float *d_x = NULL, *d_y = NULL;
    cudaMalloc((void**)&d_x, size);
    cudaCheckErrors("cudaMalloc fail 1");
    cudaMalloc((void**)&d_y, size);
    cudaCheckErrors("cudaMalloc fail 2");
    cudaMemcpyToSymbol(d_x0, x0, size);
    cudaCheckErrors("cudaMemcpyToSymbol fail 1");
    cudaMemcpyToSymbol(d_y0, y0, size);
    cudaCheckErrors("cudaMemcpyToSymbol fail 2");
    int threadsPerBlock = 512;
    int blocksPerGrid = (NV + threadsPerBlock - 1) / threadsPerBlock;
    for(k = 0; k < NP ; k++){
        calc<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y, Lx, Ly);
        cudaCheckErrors("kernel 1 call fail");
        cudaDeviceSynchronize();
        update<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y);
        cudaCheckErrors("kernel 2 call fail");
        if (k%((NP)/200)==0) {
            cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemCopy fail 1");
            cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemCopy fail 2");
            printf("(%d%%) ",100*k/NP);
            for(i = 1 ; i <= 5 ; i++) printf(",%5.2f,%5.2f ", x[i], y[i]);
            printf("\n\n");
        }
    }
    cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail 1");
    cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail 2");
    cudaMemcpyFromSymbol(x0, d_x0, size);
    cudaCheckErrors("cudaMemcpyFromSymbol fail 1");
    cudaMemcpyFromSymbol(y0, d_y0, size);
    cudaCheckErrors("cudaMemcpyFromSymbol fail 2");
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}

I have tried changing the block and grid structure, compiling with the -arch=sm_35, -arch=sm_30 and --cudart=shared options, and even changing the arrays from float to double. Nothing has worked.

2 answers:

Answer 0 (score: 1)

Your code never initializes the d_x and d_y arrays.

You allocate space for them on the device:

float *d_x = NULL, *d_y = NULL;
cudaMalloc((void**)&d_x, size);
cudaCheckErrors("cudaMalloc fail 1");
cudaMalloc((void**)&d_y, size);
cudaCheckErrors("cudaMalloc fail 2");

but you never initialize them or copy anything into them, so they contain garbage. As a result, when you call the calc kernel, these lines:

    if (isnan(d_x[i])) printf(" x(%d)!",i);
    if (isnan(d_y[i])) printf(" y(%d)!",i);

always print.
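One minimal way to rule that out (essentially what the instrumented version below does) is to give d_x and d_y defined contents before the first kernel launch. A sketch, reusing the size variable and error-check macro from the code above:

// Copy any defined values into the freshly allocated device buffers.
// Here the host initial positions are used; the instrumented version
// below copies arrays of 1.0f instead -- either removes the garbage.
cudaMemcpy(d_x, x0, size, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy init d_x");
cudaMemcpy(d_y, y0, size, cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy init d_y");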

Fix that, and some of your individual calculations still blow up on each iteration of the main loop, including the very first calc kernel call. As soon as a single iteration produces a NaN in d_x, you can see that it will propagate to all your remaining values on the next iteration: update copies d_x back into d_x0, and every thread's j loop then reads the poisoned entry.

To sort this out, I suggest instrumenting the code further with printf. I found the following modifications useful:

#include <stdio.h>
#include <time.h>
#include <assert.h>
#define NP 20000
#define DT 0.01
#define NV 621  // Fails if 621 or larger
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__device__ float d_x0[NV];
__device__ float d_y0[NV];
__global__ static void  calc(float *d_x, float *d_y, float Lx, float Ly ){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float fx, fy, t0, t1, t2, t3, t4, dx, dy, pi = acos(-1.0);
    int j, n;
    if (i<NV) {
        // For array error detection
        if (isnan(d_x0[i])) printf(" dx(%d)!",i);
        if (isnan(d_y0[i])) printf(" dy(%d)!",i);
        if (isnan(d_x[i])) printf(" x(%d)!",i);
        if (isnan(d_y[i])) printf(" y(%d)!",i);
        fx = 0.0;   fy = 0.0;
        for (j = 0 ; j < NV ; j++){
            dx = d_x0[i] - d_x0[j];
            dy = d_y0[i] - d_y0[j];
            t0 = 2.0 * dy / Ly;
            t1 = sin(2.0 * pi * dx / Lx);
            t3 = cos(2.0 * pi * dx / Lx);
                for (n = -10 ; n <= 10 ; n++){
                    if (n == 0){
                        if (j != i){
                            t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
                            t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
                            fx = fx + t1 / (t2 - t3);
            if(isnan(fx)) {printf("!8 %d, %d, %d, %f, %f, %f\n",i, j, n, fx, t2, t3); return;}
                            fy = fy + t4 / (t2 - t3);
                        }
                    }
                    else{
                        t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));
                        t4 = sinh(2.0 * pi * Ly/Lx * (dy / Ly + n));
                        fx = fx + t1 / (t2 - t3);
                        fy = fy + t4 / (t2 - t3);
                    }
                }
                fy = fy - t0;
        }
        fx = fx * pi / Lx;
        fy = fy * pi / Lx;
        d_x[i] = d_x0[i] + fx * DT;
        d_y[i] = d_y0[i] + fy * DT;
        // Clip box
        if(d_x[i] > Lx)   d_x[i] = d_x[i] - (abs(d_x[i] / Lx) * Lx);
        if(d_x[i] < 0.0)  d_x[i] = d_x[i] + ((abs(d_x[i] / Lx) + 1.0) * Lx);
        if(d_y[i] > Ly)   d_y[i] = d_y[i] - (abs(d_y[i] / Ly) * Ly);
        if(d_y[i] < 0.0)  d_y[i] = d_y[i] + ((abs(d_y[i] / Ly) + 1.0) * Ly);
    }
}
__global__ static void  update(float *d_x, float *d_y ){
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i<NV) {
        if (isnan(d_x[i])) assert(0);
        if (isnan(d_y[i])) assert(0);
        d_x0[i] = d_x[i];
        d_y0[i] = d_y[i];
    }
}
int main(int argc,char **argv) {
    float Lx, Ly, dv;
    int i, k;
    int size = (NV) * sizeof(float);
    float* x = (float*)malloc(size);
    float* y = (float*)malloc(size);
    float* x0 = (float*)malloc(size);
    float* y0 = (float*)malloc(size);
    dv = 0.12 * 16.0;
    Lx = sqrt(2.0 / 3.0 * sqrt(3.0) * NV / dv);
    Ly = Lx * sqrt(3.0) / 2.0;
    printf("Lx = %f, Ly = %f\n", Lx, Ly);
    for(i=0 ; i < NV ; i++){
        x0[i] = Lx * (rand() % 1000)/1000;
        y0[i] = Ly * (rand() % 1000)/1000;
        x[i]  = 1.0f;
        y[i]  = 1.0f;
    }
    printf("x0[0] = %f, y0[0] = %f\n", x0[0], y0[0]);
    // GPU mem management
    float *d_x = NULL, *d_y = NULL;
    cudaMalloc((void**)&d_x, size);
    cudaCheckErrors("cudaMalloc fail 1");
    cudaMalloc((void**)&d_y, size);
    cudaCheckErrors("cudaMalloc fail 2");
    cudaMemcpyToSymbol(d_x0, x0, size);
    cudaCheckErrors("cudaMemcpyToSymbol fail 1");
    cudaMemcpyToSymbol(d_y0, y0, size);
    cudaCheckErrors("cudaMemcpyToSymbol fail 2");
    cudaMemcpy(d_x, x, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail 1");
    cudaMemcpy(d_y, y, size, cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy fail 2");
    int threadsPerBlock = 512;
    int blocksPerGrid = (NV + threadsPerBlock - 1) / threadsPerBlock;
    for(k = 0; k < NP ; k++){
        printf("iter %d\n", k);
        calc<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y, Lx, Ly);
        cudaCheckErrors("kernel 1 call fail");
        cudaDeviceSynchronize();
        update<<<blocksPerGrid, threadsPerBlock>>>( d_x, d_y);
        cudaCheckErrors("kernel 2 call fail");
        if (k%((NP)/200)==0) {
            cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemCopy fail 1");
            cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
            cudaCheckErrors("cudaMemCopy fail 2");
            printf("(%d%%) ",100*k/NP);
            for(i = 1 ; i <= 5 ; i++) printf(",%5.2f,%5.2f ", x[i], y[i]);
            printf("\n\n");
        }
    }
    cudaMemcpy(x, d_x, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail 1");
    cudaMemcpy(y, d_y, size, cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy fail 2");
    cudaMemcpyFromSymbol(x0, d_x0, size);
    cudaCheckErrors("cudaMemcpyFromSymbol fail 1");
    cudaMemcpyFromSymbol(y0, d_y0, size);
    cudaCheckErrors("cudaMemcpyFromSymbol fail 2");
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}

These told me that, for elements 86 and 518, the computation below was blowing up because t2 = t3 = 1.0:

                            fx = fx + t1 / (t2 - t3);

Hopefully you can work things out from there. Note that your randomization scheme produces many duplicate values in x0 and y0:

for(i=0 ; i < NV ; i++){
    x0[i] = Lx * (rand() % 1000)/1000;  
    y0[i] = Ly * (rand() % 1000)/1000;
}

Those duplicates produce a value of 0 here:

        dx = d_x0[i] - d_x0[j];

and here cos(0) = 1.0:

        t3 = cos(2.0 * pi * dx / Lx);

For some values of i and j, you also get exactly 1 here:

                        t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + n));

That makes t2 - t3 = 0, and things blow up.
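To see that failure mode in isolation, here is a tiny host-only sketch (plain C, no CUDA required; the box dimensions are arbitrary stand-ins) of the j != i, n == 0 term for two vortices at identical coordinates:

#include <math.h>
#include <stdio.h>

int main(void) {
    double pi = acos(-1.0);
    double dx = 0.0, dy = 0.0;   // duplicate points: d_x0[i] == d_x0[j], same for y
    double Lx = 10.0, Ly = 8.66; // arbitrary box dimensions for this demo
    double t1 = sin(2.0 * pi * dx / Lx);                    // sin(0) = 0
    double t3 = cos(2.0 * pi * dx / Lx);                    // cos(0) = 1
    double t2 = cosh(2.0 * pi * Ly / Lx * (dy / Ly + 0));   // cosh(0) = 1 (n == 0)
    printf("t1 / (t2 - t3) = %f\n", t1 / (t2 - t3));        // 0/0 -> nan
    return 0;
}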

I don't think any of this is CUDA-specific. I believe this code would blow up just the same as ordinary host code with the same nested loops. And I think increasing NV aggravates the problem simply because it puts more duplicates into d_x0 and d_y0.
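If you want to verify the duplicate claim directly, here is a host-only sketch of the same rand()-based scheme (the exact position of the first duplicate depends on the platform's rand() implementation):

#include <stdio.h>
#include <stdlib.h>
#define NV 621   // try 620 vs. 621

int main(void) {
    // Each coordinate takes one of only 1000 discrete values, so
    // collisions become likely as NV grows (a birthday-paradox effect).
    int xq[NV], yq[NV], dups = 0;
    for (int i = 0; i < NV; i++) {
        xq[i] = rand() % 1000;
        yq[i] = rand() % 1000;
        for (int j = 0; j < i; j++)
            if (xq[i] == xq[j] && yq[i] == yq[j]) {
                printf("point %d duplicates point %d\n", i, j);
                dups++;
            }
    }
    printf("%d duplicate point(s) out of %d\n", dups, NV);
    return 0;
}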

Answer 1 (score: 0)

As Robert Crovella pointed out, the random number generator always produces the same sequence, and the first duplicate coordinate shows up at array position 621 (on Windows), which causes the blow-up to infinity that was observed. The problem was solved by reseeding the generator and adding the following code to the original program to check for superposed points:

bool test;   // added declaration (int j; is also needed alongside i, k in main)
for(i=0 ; i < NV ; i++){                  // 0-based: the original i=1..NV ran past the arrays
    do {
        test = false;
        x0[i] = Lx * (rand() % 1000)/1000;
        y0[i] = Ly * (rand() % 1000)/1000;
        x[i] = x0[i];  y[i] = y0[i];      // fixed typo: the original assigned y[0] = y0[i]
        for(j=0 ; j < i ; j++){           // compare against every earlier point
            if (x0[i]==x0[j] && y0[i]==y0[j]) {
                test = true;              // duplicate found: redraw this point
                printf("(%d) superposed.\n", i);
            }
        }
    } while (test);
    printf("%f, %f\n", x0[i], y0[i]);
}
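For reference, the reseeding mentioned above is presumably a single srand call near the top of main (it is not shown in the posted snippet); a minimal sketch:

// Seed the C library PRNG once; without this, rand() yields the same
// sequence on every run (implicit seed of 1). <time.h> is already
// included by the original program.
srand((unsigned)time(NULL));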