Question

当我尝试将节点数组从设备复制回主机时，我在Node.m [...]而不是值中得到零，即使我在内核中打印节点时它显示正确设置了值。不幸的是我无法自己发现任何错误，所以我请求你帮忙。我使用visual studio编译器和计算功能编译代码3.来自this的代码答案对我有用。

我粘贴整个代码，但只有有意义的部分

__global__ void divideLeft(Node * nodes,float * leftSide){...}

和

divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);

 #include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <conio.h>
#include <new>
#include <cmath>

#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (wait) getch();
      if (abort) exit(code);
   }
}

#define MSIZE 36
#define INPUT_SIZE(N) N*5 - 3*2
#define PARENT(i) (i-1)/2
#define LEFT(i) 2*i + 1
#define RIGHT(i) 2*i + 2
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1 
#define FIRST_LEVEL_SIZE 19
#define ROW_LENGTH 5
#define FIRST_LVL_MAT_SIZE 5
#define XY(x,y) x*6+y

__constant__ int dHigherTreeLevelThreshold;
__constant__ int dNodesCount;
__constant__ int dLeftSize;
__constant__ int dHeapSize;
__constant__ int dBottomNodes;
__constant__ int dRemainingNodes;
__constant__ int dRightCols;
__constant__ int dInputCount;

struct Node
{
    float m[MSIZE];
    float *x;
};

__device__ __host__ void printNode(Node node);
__global__ void divideLeft(Node * nodes,float * leftSide)
{
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if(idx>=dBottomNodes)
        return;
    int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes;
//  printf("%d %d\n",idx,nodeIdx);
    Node node = nodes[nodeIdx];
    idx*=5*3;
    node.m[XY(3,3)] = leftSide[idx+2]/3;
    node.m[XY(3,2)] = leftSide[idx+3]/2;
    node.m[XY(3,1)] = leftSide[idx+4];

    node.m[XY(2,3)] = leftSide[idx+6]/2;
    node.m[XY(2,2)] = leftSide[idx+7]*2/3;
    node.m[XY(2,1)] = leftSide[idx+8];
    node.m[XY(2,4)] = leftSide[idx+9];

    node.m[XY(1,3)] = leftSide[idx+10];
    node.m[XY(1,2)] = leftSide[idx+11];
    node.m[XY(1,1)] = leftSide[idx+12];
    node.m[XY(1,4)] = leftSide[idx+13];
    node.m[XY(1,5)] = leftSide[idx+14];

    node.m[XY(4,2)] = leftSide[idx+15];
    node.m[XY(4,1)] = leftSide[idx+16];
    node.m[XY(4,4)] = leftSide[idx+17]*2/3;
    node.m[XY(4,5)] = leftSide[idx+18]/2;

    node.m[XY(5,1)] = leftSide[idx+20];
    node.m[XY(5,4)] = leftSide[idx+21]/2;
    node.m[XY(5,5)] = leftSide[idx+22]/3;
    printNode(node);
}

void leftSideInit(float * leftSide,int size)
{
    for(int i = 0;i<size;i++)
    {
        leftSide[i] = 1;//(i+1)%26;
    }
}

int main(){
    ERRCHECK(cudaSetDevice(0));

    int leftCount = 11;
    int leftSize = leftCount*5;
    int rightSize = 10;
    int heapSize = HEAP_SIZE(leftCount);
    int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount);
    int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes));
    int remainingNodes = heapSize - greatestPowerOfTwo;

    ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int)));
    ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int)));

    float * leftSide = new float[leftSize];
    float * rightSide = new float[rightSize];
    Node * nodes = new Node[heapSize];
    Node * dNodes = nullptr;
    float * dLeftSide =nullptr;
    leftSideInit(leftSide,leftSize);

    ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize));
    ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize));
    ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float)));
    ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice));
    divideLeft<<<1,1>>>(dNodes,dLeftSide);
    ERRCHECK(cudaDeviceSynchronize());
    ERRCHECK(cudaGetLastError());
    ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
    printNode(nodes[3]);
    delete [] nodes;
    cudaFree(dNodes);

    ERRCHECK(cudaDeviceReset());

    getch();
    return 0;
}

__device__ __host__ void printNode(Node node)
{   
    for(int i= 0;i<6;i++)
        printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]);

}

Answer 1

在你的内核中，你制作了一份你正在处理的Node的本地副本：

Node node = nodes[nodeIdx];

内核的其余部分继续修改node的元素，即本地副本。

但是在完成所有修改之后，您永远不会将本地副本复制回全局副本，因此全局副本保持不变。

要解决此问题，一种可能性是在内核末尾添加此行：

nodes[nodeIdx] = node;

顺便说一句，我注意到你的struct Node包含一个指针变量：

struct Node
{
    float m[MSIZE];
    float *x;
};

您应该意识到使用带有嵌入指针的结构数组可能会有一些特殊的复杂性。您还没有真正使用该变量（x），所以我只是将其作为评论。您可能需要参考cuda tag info page以获取有关此概念的规范问题（＆＃34;在CUDA和＃34中使用指针数组;）。

cudaMemcpy struct device托管不起作用

1 个答案: