cudaMemcpyFromSymbol()无法正常工作

时间:2015-11-02 16:22:26

标签: cuda

我在从GPU的设备内存中读取数据时遇到了问题。当我把值复制到__device__内存时,一切正常!但是当我尝试取回结果时,得到的答案有时是正确的,有时却恰好是数组的初始值!

我有一个像这样的设备数组:

__device__ array[50];

一开始我将一些值复制到:

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

在内核中进行一些更改后, 我尝试从数组中读取值:

Kernel << <8, 16 >> >();

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) 
    {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    } 

结果有时是正确的,有时却是数组的初始值。

以下是我的更多代码:

//before Kernel Function body

__device__ unsigned char dev_state[128];

//////////////////////////////////////

// Round-trip check for the __device__ array dev_state: fill a host buffer
// with 0x01, copy it into dev_state, launch Kernel, copy dev_state back
// into a second host buffer and print the first 16 bytes of each buffer.
void test()
{

    // Host-side source buffer; every byte is initialised to 0x01 below.
    unsigned char CipherState[128];

    for (int i = 0; i<128; i++)                 // fill all 128 bytes
        CipherState[i] = 0x01;

    cudaError_t cudaStatus;

    // Copy the 128 host bytes into the device symbol dev_state (offset 0).
    cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, 128*sizeof(unsigned char), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        // Report the error and wait for a key press; execution then continues.
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

    printf("\n initialized:\n 0x");
    // i starts at 0 and advances by 16 while i < 16, so this loop body runs
    // exactly once: only bytes 0..15 of the 128-byte buffer are printed.
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j =0 ; j <=15; j++)
        {
            // %x without a width: a byte below 0x10 prints as a single digit.
            printf("%x", CipherState[i+j]);
        }
    }
    // Kernel (body not shown in this listing) is expected to set every byte
    // of dev_state to 0x05. Launch configuration: 8 blocks of 16 threads.
    // NOTE(review): the launch result is never checked here; a failed launch
    // would go unnoticed and leave dev_state unchanged.
    Kernel << <8, 16 >> >();

// Author's note: everything up to this line behaves as expected.

unsigned char CipherState2[128];    // host destination buffer, left uninitialised
// Copy all 128 bytes of dev_state back to the host.
cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 128*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) // same report-and-continue handling as above
{
    printf(" \n%s\n", cudaGetErrorString(cudaStatus));
    getchar();
}


    printf("\n State at the end:\n ");
    // As above, this loop runs once and prints only bytes 0..15.
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
            printf("%x",  CipherState2[i + j]);

    }
  }

有时,打印cipherstate2得到这个:

0x55555555555555555 ...... 5555555555

有时候:

0x11111111111111111 ..... 11111111111;

1 个答案:

答案 0 :(得分:2)

这是不正确的:

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
                                  ^

CipherState2是数组名,作为参数传递时已经会退化为指针,你不应该再对它取地址。正确的调用方式如下:

cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);

虽然你没有显示CipherState变量的样子,但你很可能在这里犯了类似的错误:

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
                                           ^

该调用的正确形式很可能是:

cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, statesize, 0, cudaMemcpyHostToDevice);

今后提出此类问题时,请提供MCVE(最小、完整、可验证的示例)。

举个例子,请注意这不是有效的代码:

__device__ array[50];

也许你的意思是这样的:

__device__ unsigned char dev_state[50];

编辑:您现在发布的代码(在答案中)仍然不完整,但看起来大体上是正确的。剩下的问题可能出在您没有展示的内核中,或者您的CUDA安装本身工作不正常。下面是一份完整可运行的代码,它基于您所展示的内容(我添加了一个简单的内核),用来演示预期的行为(请注意,我认为用于打印输出的for循环写得并不正确):

$ cat t966.cu
#include <stdio.h>
//before Kernel Function body

__device__ unsigned char dev_state[128];

//////////////////////////////////////

// Each thread computes its flat 1D global index and, when that index is
// below 128, stores 0x5 into the matching element of dev_state.
// Threads with an index of 128 or more do nothing.
__global__ void Kernel(){
  const int gid = blockIdx.x*blockDim.x + threadIdx.x;
  if (gid >= 128) return;   // outside the 128-element array
  dev_state[gid] = 0x5;
}

// Prints "\n0x" followed by bytes 0..15 of `state` in hexadecimal.
// %x is used without a width, so a byte below 0x10 prints as one digit.
static void printFirstRow(const unsigned char *state)
{
    printf("\n0x");
    for (int j = 0; j < 16; j++)
        printf("%x", state[j]);
}

// If `status` is not cudaSuccess, prints the CUDA error string and waits
// for a key press. Execution continues afterwards in either case.
static void reportIfFailed(cudaError_t status)
{
    if (status != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(status));
        getchar();
    }
}

// Round-trip demonstration for the __device__ array dev_state:
//   1. fill a 128-byte host buffer with 0x01 and copy it into dev_state,
//   2. launch Kernel with 8 blocks of 16 threads (128 threads in total),
//   3. copy dev_state back into a second host buffer,
//   4. print the first 16 bytes of the buffer before and after.
// Only the first 16 of the 128 bytes are printed, matching the output
// format of the original listing.
void test()
{
    unsigned char CipherState[128];
    for (int i = 0; i < 128; i++)
        CipherState[i] = 0x01;

    reportIfFailed(cudaMemcpyToSymbol(dev_state, CipherState, sizeof(CipherState), 0, cudaMemcpyHostToDevice));

    printf("\n initialized:\n 0x");
    printFirstRow(CipherState);

    // set all of the dev_state to "0x05"
    Kernel<<<8, 16>>>();
    // A kernel launch returns no status itself. cudaGetLastError() reports
    // launch failures (bad configuration, no usable device code, ...) and
    // cudaDeviceSynchronize() reports faults raised while the kernel ran.
    // Without these checks a kernel that never executed is silent and the
    // copy below simply returns the initial 0x01 values.
    reportIfFailed(cudaGetLastError());
    reportIfFailed(cudaDeviceSynchronize());

    // Zero-initialised so that a failed copy-back prints zeros instead of
    // reading indeterminate stack bytes.
    unsigned char CipherState2[128] = {0};
    reportIfFailed(cudaMemcpyFromSymbol(CipherState2, dev_state, sizeof(CipherState2), 0, cudaMemcpyDeviceToHost));

    printf("\n State at the end:\n ");
    printFirstRow(CipherState2);
    printf("\n");
}
// Program entry point: runs test() once and exits with status 0.
int main()
{
  test();
  return 0;
}
$ nvcc t966.cu -o t966
$ cuda-memcheck ./t966
========= CUDA-MEMCHECK

 initialized:
 0x
0x1111111111111111
 State at the end:

0x5555555555555555
========= ERROR SUMMARY: 0 errors
$