CUDA 程序的输出与预期不符

时间:2011-07-03 03:54:46

标签: c++ cuda

#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>


/*
 * Stores a pointer to the string literal "hello\0" in one slot of c per block.
 *
 * Expected launch: a 2D grid of blocks; the slot index is the row-major block
 * number (blockIdx.y * gridDim.x + blockIdx.x). Every thread of a block writes
 * the same slot, so one thread per block is sufficient.
 * c must have room for gridDim.x * gridDim.y pointers; no bounds check is done.
 */
__global__ void setVal(char **c){

// Row-major position of this block inside the grid.
const unsigned int slot = blockIdx.x + gridDim.x * blockIdx.y;

c[slot] = "hello\0";

}


/*
 * Prints a CUDA runtime failure to stderr.
 * what names the operation that produced status.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what){
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Launches setVal on a 3x2 grid of single-thread blocks, copies the six
 * pointers the kernel stored back to the host, then copies the bytes behind
 * each pointer to the host and prints them as a string.
 *
 * Every CUDA call is checked; the first failure is reported on stderr and the
 * remaining device work is skipped.
 * Returns 0 on success and 1 if any CUDA call failed.
 */
int main(){
    const int kWords = 6;                         // 3 x 2 blocks, one pointer slot per block
    const size_t kWordBytes = sizeof("hello\0");  // must match the literal that setVal stores

    char **gpu = NULL;
    char *x[kWords];
    char p[kWordBytes + 1];

    bool ok = cudaOk(cudaMalloc((void**)&gpu, kWords * sizeof(char *)), "cudaMalloc");

    /*
    The second level cannot be allocated from the host like this, because
    gpu[i] lives in device memory and must not be dereferenced on the host:
    for( i =0 ; i < 6 ;i++){
        cudaMalloc((void**)&gpu[i], 10 * sizeof(char));
    }*/

    if (ok){
        dim3 grid(3,2);
        setVal<<<grid, 1>>>(gpu);
        // A launch reports configuration errors through cudaGetLastError and
        // execution errors at the next synchronizing call.
        ok = cudaOk(cudaGetLastError(), "setVal launch")
          && cudaOk(cudaDeviceSynchronize(), "setVal execution")
          && cudaOk(cudaMemcpy(x, gpu, sizeof(x), cudaMemcpyDeviceToHost),
                    "copy of the pointer table");
    }

    for (int i = 0; ok && i < kWords; i++){
        // Copy only as many bytes as the literal holds, so nothing past its
        // end is read on the device.
        ok = cudaOk(cudaMemcpy(p, x[i], kWordBytes, cudaMemcpyDeviceToHost),
                    "copy of a word");
        if (ok){
            p[kWordBytes] = '\0';   // guarantee termination before printing
            printf("%s\n",p);
        }
    }

    cudaFree(gpu);   // cudaFree(NULL) is a no-op, so this is safe on every path

    getchar();
    return ok ? 0 : 1;
}

根据大家的建议,我修改了代码,以确认自己对概念的理解是否正确。但是,代码仍然无法正常工作 :(。如能提供任何帮助,将不胜感激。

3 个答案:

答案 0 :(得分:3)

试试这个 - 我在CUDA 3.2下的GTX 285上进行了测试 - 所以它比当前版本更具限制性,但它确实有效。

#include<stdio.h>
#include<string.h>

/*
 * Writes the NUL-terminated text "Hola" into the buffer that
 * word[blockIdx.x] points to.
 *
 * Expected launch: a 1D grid with one block per entry of word; only
 * blockIdx.x is used for indexing, so every thread of a block writes the
 * same buffer. Each buffer must hold at least 5 chars; no bounds check is done.
 */
__global__ void setValues(char** word)
{
    const char greeting[] = { 'H', 'o', 'l', 'a', '\0' };

    volatile char* dst = word[blockIdx.x];

    for (unsigned int k = 0; k < sizeof(greeting); ++k)
    {
        dst[k] = greeting[k];
    }
}

/*
 * Reports a failed CUDA runtime call on stderr, tagged with this file's name
 * and the given source line.
 * Returns true when err is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t err, int line)
{
    if ( cudaSuccess == err )
    {
        return true;
    }
    fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
            __FILE__, line, cudaGetErrorString( err) );
    return false;
}

/*
 * Builds an array of nObjects device buffers, publishes their addresses to
 * the device in d_x, lets setValues fill each buffer, then copies the
 * address table and every buffer back to the host and prints them.
 *
 * Every CUDA call is checked; after the first failure the remaining device
 * work is skipped, but all device allocations are still released.
 * Returns 0 on success and 1 if any CUDA call failed.
 */
int main()
{
    const size_t bufferSize = 32;
    const int nObjects = 10;

    char*  h_x[nObjects] = { NULL };   /* device addresses, kept on the host; NULL until allocated */
    char*  h_back[nObjects];           /* address table as read back from the device */
    char** d_x = 0;
    char   p[bufferSize];

    bool ok = cudaOk( cudaMalloc( (void**)(&d_x), nObjects * sizeof(char*) ), __LINE__ );

    for ( int i=0; ok && i < nObjects; i++ )
    {
        ok = cudaOk( cudaMalloc( (void**)(&h_x[i]), bufferSize * sizeof(char) ), __LINE__ );
        if ( ok )
        {
            /* %p prints the full pointer width on every platform. */
            printf("h_x[%d] = %p\n",i,(void*)h_x[i]);
        }
    }

    ok = ok && cudaOk( cudaMemcpy( d_x, h_x, nObjects*sizeof(char*), cudaMemcpyHostToDevice), __LINE__ );
    if ( ok )
    {
        printf("Copied h_x[] to d_x[]\n");
    }

    /* Seed the first buffer from the host; sizeof(msg) includes the terminating NUL. */
    char msg[] = "Hello World!";
    ok = ok && cudaOk( cudaMemcpy( h_x[0], msg, sizeof(msg), cudaMemcpyHostToDevice ), __LINE__ );

    if ( ok )
    {
        setValues<<<nObjects,1>>>(d_x);

        /*  Launch-configuration errors surface through cudaGetLastError,
            execution errors at the next synchronizing call.
            cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize
            and needs CUDA 4.0 or newer.  */
        ok = cudaOk( cudaGetLastError(), __LINE__ )
          && cudaOk( cudaDeviceSynchronize(), __LINE__ );
    }

    if ( ok )
    {
        printf("Kernel Completed Successfully.  Woot.\n\n");
    }

    printf("d_x = %p\n", (void*)d_x );
    printf("h_x = %p\n", (void*)h_x );

    /*  Read the table into a separate array so h_x stays intact for cleanup.  */
    ok = ok && cudaOk( cudaMemcpy( h_back, d_x, nObjects*sizeof(char*), cudaMemcpyDeviceToHost), __LINE__ );

    printf("d_x = %p\n", (void*)d_x );
    printf("h_x = %p\n", (void*)h_x );

    for ( int i=0; ok && i < nObjects; i++ )
    {
        ok = cudaOk( cudaMemcpy( p, h_back[i], bufferSize*sizeof(char), cudaMemcpyDeviceToHost), __LINE__ );
        if ( ok )
        {
            p[bufferSize - 1] = '\0';   /* never print past the end of p */
            printf("%d p[] = %s\n",i,p);
        }
    }

    /*  Release device memory; cudaFree(NULL) is a no-op.  */
    for ( int i=0; i < nObjects; i++ )
    {
        cudaFree( h_x[i] );
    }
    cudaFree( d_x );

    getchar();

    return ok ? 0 : 1;
}

正如@Jon所说,你不能将x(按你声明的方式)传递给GPU,因为它是一个位于 CPU(主机)内存中的地址。在上面的代码中,我先在主机上创建了一个char *数组来保存各个设备缓冲区的地址,再把这个数组复制到我在GPU上分配的char **中。希望这有帮助!

答案 1 :(得分:2)

你的代码的主要问题是没有为setValues调用分配任何设备内存。你不能传递一个指向主机内存的指针(char * x [6])并期望它能够工作;CUDA内核只能访问设备内存。正确的做法是:先分配设备内存,在内核中对其进行操作,然后再把结果复制回主机:

#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>

/*
 * Writes the character '4' into one element of arr per block.
 *
 * Expected launch: a 2D grid of blocks; the element index is the row-major
 * block number (blockIdx.y * gridDim.x + blockIdx.x). arr must hold at least
 * gridDim.x * gridDim.y chars; no bounds check is done.
 */
__global__ void setValues(char *arr){
    const unsigned int cell = blockIdx.x + blockIdx.y * gridDim.x;

    arr[cell] = '4';
}

/*
 * Prints a CUDA runtime failure to stderr.
 * what names the operation that produced status.
 * Returns true when status is cudaSuccess, false otherwise.
 */
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Allocates NCHARS bytes on the device, lets setValues fill them from a 3x2
 * grid of single-thread blocks, copies the result back and prints it between
 * angle brackets.
 *
 * Returns 0 on success and 1 if any CUDA call failed (the failure is
 * reported on stderr and nothing is printed to stdout).
 */
int main() {
    const int NCHARS=6;            // must equal grid.x * grid.y below
    char *xd = NULL;
    char p[NCHARS + 1] = "";       // result plus terminating NUL; no heap allocation needed

    bool ok = cudaOk(cudaMalloc((void**)&xd, NCHARS), "cudaMalloc");

    if (ok) {
        dim3 grid(3,2);
        setValues<<<grid,1>>>(xd);

        // cudaGetLastError catches a bad launch; the blocking cudaMemcpy
        // waits for the kernel and reports any error raised while it ran.
        ok = cudaOk(cudaGetLastError(), "setValues launch")
          && cudaOk(cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost), "cudaMemcpy");
    }

    if (ok) {
        p[NCHARS]='\0';
        printf("<%s>\n", p);
    }
    getchar();

    cudaFree(xd);   // cudaFree(NULL) is a no-op

    return ok ? 0 : 1;
}

答案 2 :(得分:1)

我在这里看到几个问题。以下是一些最明显的问题:

首先,我猜测字符串常量“4”存储在主机(CPU)内存中,因此你必须将其显式复制到设备(全局)内存。只有当字符串“4”位于设备内存中之后,才能把指向它的指针存入设备内存中的某个位置,例如数组arr的元素。

其次,传递给setValues内核的数组x也在主机内存中。请记住,你需要使用cudaMalloc来分配一块(全局)设备内存区域,这样设备内核才能通过指针访问该区域。