Question

我的第一个CUDA应用程序出了问题。基本上它应该生成N个线性微分方程并使用一阶方法在数值上求解它们。变量t（时间）从0到T迭代，步长= TAU = 0.0001。如果T足够小（比如0.001），一切都还可以，但如果T == 0.1或更大，那么内核似乎没有做任何事情。我该如何看待这种情况？

N - 方程式数，TAU - 时间步长，TN - 每个块的线程数，T - 结束时间

变量r没有做任何事情。我用它来验证内核是否做了什么。因此，如果T == 0.0001，则r == 283，但如果T == 0.1，r == 0。

#include <stdio.h>
#include <math.h>
#include <time.h>

#define N 4096
#define TAU 0.0001f
#define TN 2
#define T 0.1f
#define PI 3.141592f

__global__ void kern(float* v, float* m, float* r)
{
    *r = 283;
    __syncthreads();
    int tid = blockIdx.x*TN + threadIdx.x;

    for(float t = 0; t <= T; t += TAU)
    {
        float f = 0;
        __syncthreads();
        for(int k = 0; k < N; ++k)
            f += m[N*tid + k]*v[k];
        f *= TAU;
        f += v[tid];
        __syncthreads();
        v[tid] = f;
    }
}

int main()
{
    float* v = new float[N];
    float* m = new float[N*N];

    for(int i = 0; i < N; ++i)
        v[i] = sin(2*PI*i/N); //setting initial conditions

    for(int i = 0; i < N*N; ++i)
        m[i] = cos(2*PI*i/(N*N)); //coefficients in right hand part of the equations

    // printing some of the values (total: 8 values) to compare with result
    for(int i = 0; i < N*N; i += N*N / 8) printf("%f ", m[i]); printf("\n\n");
    for(int i = 0; i < N; i += N / 8) printf("%f ", v[i]); printf("\n");

    float* cv;
    float* cm;
    float* cr;
    cudaMalloc((void**)&cv, N*sizeof(float));
    cudaMalloc((void**)&cm, N*N*sizeof(float));
    cudaMalloc((void**)&cr, sizeof(float));

    cudaMemcpy(cv, v, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(cm, m, N*N*sizeof(float), cudaMemcpyHostToDevice);

    dim3 blocks(N / TN);
    dim3 threads(TN);

    time_t ts = time(0);
    printf("starting kernel\n");
    kern<<<blocks, threads>>>(cv, cm, cr);
    printf("kernel stopped\n");
    time_t ts_end = time(0);

    cudaMemcpy(v, cv, N*sizeof(float), cudaMemcpyDeviceToHost);
    float r;
    cudaMemcpy(&r, cr, sizeof(float), cudaMemcpyDeviceToHost);

    for(int i = 0; i < N; i += N / 8) printf("%f ", v[i]); printf("\n");
    printf("%d\n", ts_end - ts);
    printf("result: %f\n", r);


    delete[] m;
    delete[] v;

    cudaFree(cv);
    cudaFree(cm);
    cudaFree(cr);
}

Answer 1

在调用cuda api调用时，请务必检查返回码。

由于某些api错误，很可能你的程序永远不会被运行。

Answer 2

“内核停止”是什么意思？你的意思是第62行的打印输出吗？

请注意，内核启动是异步的 - 也就是说，第61行不等待内核完成。您应该在内核启动后使用“cudaDeviceSynchronize”来等待内核完成。请注意，cudaMemcpy也将在内核启动时同步。

内核无故停止

2 个答案: