Question

我一直在尝试使用溪流和一维纹理，但每次看里面时纹理似乎都是空的。一开始我的计划是使用2个流，但是我无法访问纹理，因此我将流的数量减少到1（用于调试），将内核减少到1个1个块的线程，如您所见下方。

#include <stdio.h>
#include <string.h>
#include <cuda.h>

texture <int,1,cudaReadModeElementType> tex1;

__global__
void textureTest(int *out){
    int  tid =  blockIdx.x * blockDim.x + threadIdx.x;
    float x;
    int i;
    for(i=0; i<30*8; i++){
        x = tex1Dfetch(tex1, i);
        printf("%d: %d \n ",i,x);
    }
    out[0]=x;
}

void testTextureCPU(){
    const int N = 100/2;
    int *array_d0;
    int *array_d1;
    int *array_h;
    int x=0;
    int *out_d0 =(int *)calloc(1, sizeof(int));
    int *out_d1 =(int *)calloc(1, sizeof(int));
    int *out_h =(int *)calloc(2, sizeof(int));

    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);   

    cudaHostAlloc((void**)&array_d0, (30 * 8*sizeof(int)),cudaHostAllocDefault);
    cudaHostAlloc((void**)&array_d1, (30 * 8*sizeof(int)),cudaHostAllocDefault);
    cudaHostAlloc((void**)&array_h, (30 * 8*sizeof(int)),cudaHostAllocDefault);

    cudaMalloc((void **)&out_d0,  1 *sizeof(int));
    cudaMalloc((void **)&out_d1,  1 *sizeof(int));
    cudaHostAlloc((void**)&out_h, (2*sizeof(int)),cudaHostAllocDefault);


    array_h[8 * 10 + 0] = 10;
    array_h[8 * 11 + 1] = 11;
    array_h[8 * 12 + 2] = 12;
    array_h[8 * 13 + 3] = 13;
    array_h[8 * 14 + 4] = 14;
    array_h[8 * 15 + 5] = 15;
    array_h[8 * 16 + 6] = 16;
    array_h[8 * 17 + 7] = 17;

    for(x=0; x<2; x++){

        cudaMemcpyAsync(array_d0, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream0);
        cudaMemcpyAsync(array_d1, array_h, (30 * 8*sizeof(int)), cudaMemcpyHostToDevice, stream1);

        cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));

        textureTest<<<1,2,0,stream0>>>(out_d0);

        cudaBindTexture(NULL,tex1,array_d0, (30 * 8 *sizeof(int)));

        textureTest<<<1,2,0,stream1>>>(out_d1);

        cudaMemcpyAsync(out_h+x, out_d0 , 1 * sizeof(int), cudaMemcpyDeviceToHost, stream0);
        cudaMemcpyAsync(out_h+x+N, out_d1 ,1 * sizeof(int), cudaMemcpyDeviceToHost, stream1);
    }
} 

int main(void){
    testTextureCPU();
    return 0;
}

但我无法弄清楚这段代码有什么问题，以及如何让它适用于一个或多个流。

Answer 1

您编辑的代码包含许多绝对基本错误，这些错误与纹理或它们在流中的使用无关：

在内核中，你有一个破损的printf语句，它将浮点值视为整数
在主机代码中，用于填充纹理的主机内存大部分未初始化
在主机循环中，第二次cudaMemcpyAsync调用有一个可怕的缓冲区溢出

如果您修复了这三件事，代码将按预期工作。我建议将来更多地关注代码的质量。

CUDA中的tex1Dfetch和Streams

1 个答案: