Question

我是cuda编程的初学者。我尝试使用自己的简单代码，但它无法正常工作，我也不知道还能做些什么。

我的代码：

#include <mpi.h>
#include <cuda.h>
#include <stdio.h>
#include <sys/wait.h>
// Prototypes
__global__ void helloWorld(char*);
__device__ int  getGlobalIdx_2D_2D();

// Host function

int main(int argc, char** argv)
{
    unsigned int i, N, gridX, gridY, blockX, blockY;
    N = 4096000;

    char *str = (char *) malloc(N*sizeof(char));
    for(i=0; i < N; i++) str[i]='c';

    MPI_Init (&argc, &argv);

    char *d_str;
    size_t size = (size_t) N*sizeof(char);
    cudaMalloc((void**)&d_str, size);
    cudaMemcpy(d_str, str, size, cudaMemcpyHostToDevice);

    gridX = 100;
    gridY = 10;
    blockX = blockY = 64;
    dim3 dimGrid(gridX, gridY);  // 4096 chars per block
    dim3 dimBlock(blockX, blockY); // one thread per character, 2D
    printf("dimGrid(%d, %d)\t", gridX, gridY);
    printf("dimBlock(%d, %d)\t", blockX, blockY);
    helloWorld<<< dimGrid, dimBlock >>>(d_str);

    cudaMemcpy(str, d_str, size, cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    MPI_Barrier (MPI_COMM_WORLD);

    cudaFree(d_str);

    printf("\nRes:\n");
    for(i = 0; i < N; i++) printf("\t[%u] %c\n", i, str[i]);

    MPI_Finalize ();

    free(str);
    return 0.0;
}

// Device kernel
__global__ void helloWorld(char* str)
{
    // determine where in the thread grid we are
    int pos = getGlobalIdx_2D_2D();
    if (pos % 2 == 0) str[pos] -= 2;
    else str[pos] += 8;
}

__device__ int getGlobalIdx_2D_2D()
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;
    int threadId = blockId * (blockDim.x * blockDim.y) +
                     (threadIdx.y * blockDim.x) + threadIdx.x;
    return threadId;
}

我想要的输出是：jajajajajajaja ... x4096000

我已经读过“＆＃39;％＆＃39;操作效率不高，但我认为不存在问题。

谢谢！

Answer 1

你绝对没有CUDA error checking，这样做真的很有帮助。启用它后，您会发现64 x 64的块大小无效，因为它导致一个块中的4096个线程，这不是有效的配置。

Cuda程序不起作用

1 个答案: