Question

我在从GPU的设备内存中读取数据时遇到了问题。当我将值复制到__device__内存时，一切正常！但是当我试图把结果读回来时，结果有时是正确的，有时却恰好是数组的初始值！

我有一个像这样的设备数组：

__device__ array[50];

一开始我将一些值复制到该数组中：

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

在内核中进行一些更改后，我尝试从数组中读取值：

Kernel << <8, 16 >> >();

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) 
    {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

结果有时是正确的，有时却是数组的初始值。

以下是我的更多代码：

// Declared at file scope, before the kernel function body.
// Device-global buffer of 128 unsigned chars; test() below writes it with
// cudaMemcpyToSymbol and reads it back with cudaMemcpyFromSymbol.

__device__ unsigned char dev_state[128];

//////////////////////////////////////

// Round-trip test of dev_state as posted by the asker:
//   1. fill a 128-byte host buffer with 0x01 and copy it into dev_state,
//   2. print the first 16 host bytes in hex,
//   3. launch Kernel with 8 blocks of 16 threads,
//   4. copy dev_state back into a second host buffer and print its first 16 bytes.
// A failing runtime call is reported with cudaGetErrorString(); the function
// then waits on getchar() and carries on (it does not return early).
void test()
{

    unsigned char CipherState[128];

    // Initial pattern: every byte is 0x01.
    for (int i = 0; i<128; i++)                 
        CipherState[i] = 0x01;

    cudaError_t cudaStatus;

    // The host array is passed directly (no '&'); all 128 bytes go to offset 0 of dev_state.
    cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, 128*sizeof(unsigned char), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

    printf("\n initialized:\n 0x");
    // NOTE(review): with bound 16 and step 16 this outer loop runs exactly once (i == 0),
    // so only CipherState[0..15] is printed, and "0x" is emitted a second time.
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j =0 ; j <=15; j++)
        {
            printf("%x", CipherState[i+j]);
        }
    }
    // set all of the dev_state to "0x05"
    // (Kernel's body is not shown in the question, so this is the asker's description.)
    // NOTE(review): the launch result is never checked - no cudaGetLastError() follows,
    // so a launch that fails would go unnoticed here.
    Kernel << <8, 16 >> >();

//  until this line, everything is OK (asker's remark)

// Destination for the read-back; it is not initialised before the copy.
unsigned char CipherState2[128];    
cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 128*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess) 
{
    printf(" \n%s\n", cudaGetErrorString(cudaStatus));
    getchar();
}


    printf("\n State at the end:\n ");
    // Same single-iteration loop as above: prints CipherState2[0..15] only.
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
            printf("%x",  CipherState2[i + j]);

    }
  }

有时，打印cipherstate2得到这个：

0x55555555555555555 ...... 5555555555

有时候：

0x11111111111111111 ..... 11111111111;

Answer 1

这是不正确的：

unsigned char CipherState2[50];

cudaStatus = cudaMemcpyFromSymbol(&CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
                                  ^

CipherState2已经是一个指针。你不应该对它取地址。相反，你应该像这样调用：

cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 50*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);

虽然你没有显示CipherState变量的样子，但你很可能在这里犯了类似的错误：

cudaStatus = cudaMemcpyToSymbol(dev_state, &CipherState, statesize, 0, cudaMemcpyHostToDevice);
                                           ^

该调用的正确形式很可能是：

cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, statesize, 0, cudaMemcpyHostToDevice);

以后提出这类问题时，请提供一个MCVE（最小、完整且可验证的示例）。

举个例子，请注意这不是有效的代码：

__device__ array[50];

也许你的意思是这样的：

__device__ unsigned char dev_state[50];

编辑：您现在发布的代码（在答案中）仍然不完整，但看起来大部分是正确的。剩下的问题可能出在您未展示的内核中，或者您的CUDA安装可能无法正常工作。下面是一份围绕您所展示的内容构建的完整可运行代码（我添加了一个简单的内核），用来演示预期的行为（请注意，我认为您用于打印输出的for循环写得并不正确）：

$ cat t966.cu
#include <stdio.h>
// Declared at file scope, before the kernel function body.
// Device-global buffer of 128 unsigned chars: Kernel writes it on the device,
// and test() copies to/from it with cudaMemcpyToSymbol / cudaMemcpyFromSymbol.

__device__ unsigned char dev_state[128];

//////////////////////////////////////

// Stores 0x5 into dev_state[g], where g is the thread's flat 1D global index.
// Threads whose index falls outside the 128-element array do nothing, so any
// 1D launch is safe; the <<<8, 16>>> launch in test() supplies exactly 128 threads.
__global__ void Kernel()
{
  const int g = blockIdx.x * blockDim.x + threadIdx.x;
  if (g >= 128)
    return;
  dev_state[g] = 0x5;
}

// Round-trip demo for dev_state:
//   1. fill a 128-byte host buffer with 0x01 and copy it into dev_state,
//   2. print the first 16 host bytes in hex,
//   3. launch Kernel (8 blocks x 16 threads = 128 threads) to overwrite dev_state with 0x05,
//   4. copy dev_state back to the host and print the first 16 bytes.
// Every CUDA call - including the kernel launch - is checked. A failure is
// reported with cudaGetErrorString() and the function waits on getchar()
// before carrying on, matching the error-handling style used throughout.
void test()
{
    unsigned char CipherState[128];

    for (int i = 0; i<128; i++)
        CipherState[i] = 0x01;

    cudaError_t cudaStatus;

    // The array name already designates the host buffer; do not take its address.
    cudaStatus = cudaMemcpyToSymbol(dev_state, CipherState, 128*sizeof(unsigned char), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

    printf("\n initialized:\n 0x");
    // With bound 16 and step 16 the outer loop runs once, so only bytes 0..15
    // are shown (raising the bound to 128 would print every 16-byte row).
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j =0 ; j <=15; j++)
        {
            printf("%x", CipherState[i+j]);
        }
    }

    // set all of the dev_state to "0x05"
    Kernel<<<8, 16>>>();

    // A kernel launch returns no status of its own. Without this check a failed
    // launch is silent and dev_state simply keeps its initial 0x01 contents.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

    // Zero-initialised so that a failed copy never prints indeterminate stack bytes.
    unsigned char CipherState2[128] = {0};

    // Blocking device-to-host copy issued after the kernel on the default stream;
    // its status is checked like every other runtime call.
    cudaStatus = cudaMemcpyFromSymbol(CipherState2, dev_state, 128*sizeof(unsigned char),0, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        printf(" \n%s\n", cudaGetErrorString(cudaStatus));
        getchar();
    }

    printf("\n State at the end:\n ");
    // Same single-row print as above: bytes 0..15 of the read-back buffer.
    for (size_t i = 0; i < 16; i+=16)
    {
        if (i % 16 == 0)
            printf("\n0x");
        for (int j = 0; j <= 15; j++)
            printf("%x",  CipherState2[i + j]);

    }
    printf("\n");
}
// Program entry point: runs the dev_state round-trip demo once and exits.
int main()
{
  test();
  return 0;
}
$ nvcc t966.cu -o t966
$ cuda-memcheck ./t966
========= CUDA-MEMCHECK

 initialized:
 0x
0x1111111111111111
 State at the end:

0x5555555555555555
========= ERROR SUMMARY: 0 errors
$

cudaMemcpyFromSymbol（）无法正常工作

1 个答案: