Question

我们已借助下面这篇帖子，成功创建了包含基本类型指针（如 int *）的结构体。纹理能为只读数组带来明显的性能提升。我们使用了大量纹理，这使得内核及内核子函数的参数列表变得冗长而复杂。我们希望将纹理对象嵌入结构体中，以缩短参数列表并降低复杂度。

Copying a struct containing pointers to CUDA device

下面的代码段展示了我们所采用的方法。它可以通过编译，但在运行时崩溃。

// Initialize the texture description: with cudaReadModeElementType a fetch
// returns the stored element value as-is.
// textureDescription is declared outside this snippet (presumably a
// cudaTextureDesc -- confirm against the full source).
memset(&textureDescription, 0, sizeof(textureDescription));
textureDescription.readMode = cudaReadModeElementType;

// Describe the backing storage as a linear buffer of 32-bit signed elements.
// intArray and count come from outside this snippet (assumed: intArray is a
// device pointer to count ints -- confirm).
cudaTextureObject_t texture = 0;
cudaResourceDesc resource;
memset(&resource, 0, sizeof(resource));
resource.resType = cudaResourceTypeLinear;
resource.res.linear.devPtr = intArray;
resource.res.linear.desc.f = cudaChannelFormatKindSigned;
resource.res.linear.desc.x = 32; // bits per channel
resource.res.linear.sizeInBytes = count*sizeof(int);

// The resource descriptor is passed by pointer, like the texture descriptor.
cudaCreateTextureObject(&texture, &resource, &textureDescription, NULL);

// These declarations are in the .h file
// Wraps the texture object so it can be handed around as a single value.
struct SampleStructure {
    cudaTextureObject_t texture;
};
typedef struct SampleStructure SampleStructure;

// Host-side and device-side pointers to the structure.
SampleStructure *structureHost, *structureDevice;

// Create host and device structures
structureHost = (SampleStructure *)malloc(sizeof(SampleStructure));
cudaMalloc(&structureDevice, sizeof(SampleStructure));

// Store the texture object handle in the host copy of the structure
structureHost->texture = texture;

// Copy the host structure to the device allocation (global memory)
cudaMemcpy(structureDevice, structureHost, sizeof(SampleStructure), cudaMemcpyHostToDevice);

// Pass both the bare texture object and the pointer to the device structure
// that embeds the same texture object
kernel<<<1,1>>>(texture, structureDevice);

...
// Fetches element `index` twice: once through the texture object passed
// directly as an argument, and once through the texture object stored in the
// structure that structureDevice points to.
// `value` and `index` are placeholders; their declarations are not shown in
// this snippet.
__global__ void kernel(cudaTextureObject_t texture, SampleStructure *structureDevice)
{
    // Direct texture argument: runs successfully at runtime
    value = tex1Dfetch<int>(texture, index);

    // Texture read through the structure pointer: crashes at runtime
    cudaTextureObject_t embeddedTexture = structureDevice->texture;
    value = tex1Dfetch<int>(embeddedTexture, index);
}

在内核代码（或子函数）中使用“texture”变量时，运行正常；而使用“structureDevice->texture”时，则会在运行时崩溃。

能否有人给出一段简单的示例代码，演示如何将纹理对象嵌入传递给内核的结构体中，并且运行时不会崩溃？或者能否指出我们给出的代码中错误可能出在哪里？

Answer 1

按值传递结构体解决了这个问题。下面是可以正常工作的等效代码。感谢 @talonmies 的建议。

虽然结构体可以简化参数列表，但它可能会减慢执行速度，因为系统必须访问全局内存 2 次而不是 1 次：1 次用于获取结构体，1 次用于获取纹理。为了提高性能，可以将结构体复制到共享内存中。在共享内存中使用该结构体可以提高性能。

// Create the Texture Object
// `texture` (cudaTextureObject_t), `textureDescription`, `intArray` and
// `count` are not declared in this snippet; they are assumed to be set up
// before this point -- confirm against the full source.
cudaResourceDesc resource;
memset(&resource, 0, sizeof(resource));
resource.resType = cudaResourceTypeLinear;
resource.res.linear.devPtr = intArray;
resource.res.linear.desc.f = cudaChannelFormatKindSigned;
resource.res.linear.desc.x = 32; // bits per channel
resource.res.linear.sizeInBytes = count*sizeof(int);

// The resource descriptor is passed by pointer, like the texture descriptor.
cudaCreateTextureObject(&texture, &resource, &textureDescription, NULL);

// These structure declarations are in the .h file
// Wraps the texture object so it can be passed to a kernel as one argument.
struct SampleStructure {
    cudaTextureObject_t texture;
};
typedef struct SampleStructure SampleStructure;

// Host-side instance; it is handed to the kernel by value.
SampleStructure structureHost;

// Assign the texture object to the host structure
structureHost.texture = texture;

// Pass the bare texture object and, by value, the structure that embeds the
// same texture object
kernel<<<1,1>>>(texture, structureHost);

...
// Fetches element `index` through the texture object passed directly and
// through a shared-memory copy of the structure passed by value.
// `value` and `index` are placeholders; their declarations are not shown in
// this snippet.
__global__ void
kernel(cudaTextureObject_t texture, SampleStructure structureDevice) {
    __shared__ SampleStructure structureSharedMemory;

    // Copy the structure to shared memory for faster access
    if (threadIdx.x == 0)
       structureSharedMemory = structureDevice;
    // Block-wide barrier: no thread may read the shared copy until the write
    // above has completed. A memory fence such as __threadfence_block() only
    // orders the calling thread's own accesses and does not make the other
    // threads wait. The barrier sits outside the `if` so that every thread in
    // the block reaches it.
    __syncthreads();

    value = tex1Dfetch<int>(texture, index); // Runs successfully at runtime
    value = tex1Dfetch<int>(structureSharedMemory.texture, index); // Runs successfully at runtime
}

如何在结构中嵌入CUDA纹理对象？

1 个答案: