Question

我尝试使用2个不同的流，如“CUDA By Example”一书中所示，并将其应用于某些代码，不幸的是，当我运行它时，我获得了分段错误。我以前编写过没有流的代码，但是它工作正常，但我在这里看不到问题。

#include <cuda.h>
#include <stdio.h>


__global__ 
 void GPU(node *tree ,char *data,int *out){
    int  tid =  blockIdx.x * blockDim.x + threadIdx.x;
 }



void streamTest(wrapp * wrap, char *data){

    int size = wrap->size;
    nodes *tree = wrap->nodes;

    char *data_d0;
    nodes *tree_d0;
    int *out_d0;

    char *data_d1;
    nodes *tree_d1;
    int *out_d1;

    char *data_h;
    nodes *tree_h;
    int *out_h;

    const int N = 100000000/100;

    cudaStream_t stream0, stream1;

     cudaMalloc((void **)&data_d0, N * sizeof(char));
     cudaMalloc((void **)&tree_d0, (wrap->size*sizeof(nodes)));
     cudaMalloc((void **)&out_d0,  sizeof(int));


     cudaMalloc((void **)&data_d1, N * sizeof(char));
     cudaMalloc((void **)&tree_d1, (wrap->size*sizeof(nodes)));
     cudaMalloc((void **)&out_d1,  sizeof(int));


     cudaHostAlloc((void**)&data_h, 100000000*(sizeof(char)),cudaHostAllocDefault);
     cudaHostAlloc((void**)&tree_h, wrap->size*(sizeof(nodes)),cudaHostAllocDefault);
     cudaHostAlloc((void**)&out_h, (sizeof(int)),cudaHostAllocDefault);



    int x;
    for(x=0; x<100000000; x++){
        data_h[x] = data_h[x];
    }

    int z;
    for(x=0; x<wrap->size;x++){
        tree_h[x].value = wrap->nodes[x].value;

        for(z=0; z<32; z++){
          tree_h[x].array[z] = wrap->nodes[x].array[z];  
        }
    }

    for(x=0; x<100000000; x+=N*2){

         cudaMemcpyAsync(data_d0, data_h+x, N*sizeof(char), cudaMemcpyHostToDevice, stream0);
         cudaMemcpyAsync(data_d1, data_h+x+N, N*sizeof(char), cudaMemcpyHostToDevice, stream1);


         cudaMemcpyAsync(tree_d0, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream0);
         cudaMemcpyAsync(tree_d1, tree_h, wrap->size*sizeof(nodes), cudaMemcpyHostToDevice, stream1);

        GPU<<<256,256,0,stream0>>>(tree_d0, data_d0, out_d0 );
        GPU<<<256,256,0,stream1>>>(tree_d1, data_d1, out_d1);

    } 

}

int main(void){

    char *data = (char *)calloc('a', 100000000 *sizeof(char));

    nodes *node = (nodes *) malloc(sizeof(nodes));

    streamTest(wrap, data);
}

当我尝试运行时，我获得：

./a.out Segmentation fault (core dumped)

当我检查var/log/kern.log时，我可以看到以下内容：

a.out[20204]: segfault at 4 ip 00007fd26303f92c sp 00007fff7694efb8 error 4 in libcuda.so.331.49[7fd262e09000+b6f000]

Answer 1

要使用streams，您必须先创建它们。

当我像这样修改你的代码时：

 cudaStream_t stream0, stream1;
 cudaStreamCreate(&stream0);    // add this line
 cudaStreamCreate(&stream1);    // add this line

对我来说，段错误消失了。

使用CUDA流时的Segfault

1 个答案: