Question

我有一个cuda内核函数来交换struct数组中的元素，但是当struct元素足够大（例如，占用超过120字节）时，交换是错误的。这是一个简单的例子：

#include <iostream>
#include <cuda_runtime.h> 

using namespace std;

// Number of float entries in S::f; together with constID this sets sizeof(S).
const int f_num = 30;
// Number of double entries for the alternative S::d member, which is
// currently commented out in S, so this constant is unused as written.
const int d_num = 15;

// Plain aggregate used as the array element that the swap kernel exchanges.
// With f_num == 30 this is one int plus 30 floats (124 bytes assuming
// 4-byte int and float), i.e. the "large struct" case discussed in the text.
struct S
{
    int constID;        // identifier; main() sets it to the element's index
    float f[f_num];     // payload whose only role here is to enlarge the struct
    // Alternative payload (disabled): 15 doubles instead of 30 floats.
    //double d[d_num];
};

// Exchanges s[tid] with s[tid + n] for every thread whose tid is below n.
//
// s: array of S that the kernel reads and writes in place.
// n: number of element pairs to swap.
//
// The index is threadIdx.x only (no blockIdx term), so every block would
// touch the same elements; the launch shown in main() uses a single block.
// There is no check that tid + n lies inside the array: the caller must
// supply at least 2 * n elements.
//
// NOTE: the accompanying write-up reports that, with CUDA 7.5 / 8 and a
// sufficiently large S, the final store below does not take effect.
__global__ void
cudaSwap(S *s, int n)
{
    int tid = threadIdx.x;
    S temp;  // per-thread copy of the element being displaced
    if(tid < n)
    {
        temp = s[tid];
        s[tid] = s[tid + n];
        // Store the saved element into the upper slot (the reportedly lost write).
        s[tid +n] = temp;
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": "
              << cudaGetErrorString(status) << std::endl;
    return false;
}

// Host driver for the swap reproducer.
//
// Fills an array of S on the host, copies it to the device, launches
// cudaSwap to exchange elements [0, swapCount) with
// [swapCount, 2 * swapCount), copies the array back and prints the constID
// of every element before and after the swap.
//
// Returns 0 on success and 1 if any CUDA runtime call or the kernel launch
// failed; in that case the "swaped" listing is not printed, because the host
// buffer would not hold a valid result.
int main()
{
    const int count = 20;     // number of S elements on host and device
    const int swapCount = 5;  // number of element pairs the kernel swaps
    const size_t bytes = sizeof(S) * count;

    std::cout << "sizeof float is " << sizeof(float) << std::endl;
    std::cout << "sizeof double is " << sizeof(double) << std::endl;

    S *h_s = new S[count];
    for(int i = 0; i < count; ++i)
    {
        h_s[i].constID = i;
        for(int j = 0; j < f_num; ++j)
        {
            // Value encodes the owner index in the integer part.
            h_s[i].f[j] = (float)i + (float)j/100;
            // For the disabled double payload (valid only for j < d_num):
            //h_s[i].d[j] = (double)i + (double)j/100;
        }
    }

    std::cout << "original h_s:" << std::endl;
    for(int i = 0; i < count; ++i)
    {
        std::cout << h_s[i].constID << std::endl;
    }
    std::cout << std::endl;

    // Every step is skipped once an earlier one has failed, so a single
    // failure is reported exactly once and cleanup still runs.
    S *d_s = 0;
    bool ok = checkCuda(cudaMalloc((void**)&d_s, bytes), "cudaMalloc");
    ok = ok && checkCuda(cudaMemset(d_s, 0, bytes), "cudaMemset");
    ok = ok && checkCuda(cudaMemcpy(d_s, h_s, bytes, cudaMemcpyHostToDevice),
                         "cudaMemcpy (host to device)");

    if(ok)
    {
        cudaSwap<<<1, count>>>(d_s, swapCount);
        // A launch does not return a status; launch failures are only
        // visible through cudaGetLastError().
        ok = checkCuda(cudaGetLastError(), "cudaSwap launch");
    }

    // The blocking copy also waits for the kernel and reports any error
    // raised while it was executing.
    ok = ok && checkCuda(cudaMemcpy(h_s, d_s, bytes, cudaMemcpyDeviceToHost),
                         "cudaMemcpy (device to host)");

    if(ok)
    {
        std::cout << "swaped h_s:" << std::endl;
        for(int i = 0; i < count; ++i)
        {
            std::cout << h_s[i].constID << std::endl;
        }
        std::cout << std::endl;
    }

    delete [] h_s;
    cudaFree(d_s);  // d_s is still 0 if cudaMalloc failed; freeing 0 is a no-op

    return ok ? 0 : 1;
}

当结构体由少于30个float元素或15个double元素组成时，结果为5 6 7 8 9 0 1 2 3 4 10 11 ...（正确）；但当结构体更大时，结果为5 6 7 8 9 5 6 7 8 9 10 11 ...，这意味着 s[tid +n] = temp; 这一句没有生效。我是cuda的新手，谁能告诉我问题的原因以及如何解决问题？也许它与寄存器有关？我不确定... 非常感谢！

Answer 1

这似乎是CUDA 7.5和CUDA 8中的一个编译器错误（根据我的测试，它不受PTX优化级别的影响，所以我认为这个bug出现在从CUDA源代码生成PTX的阶段，而不是在把PTX编译成SASS的阶段）。

使用调试开关（-G）进行编译似乎会使问题消失，但这会对性能产生负面影响。

根据我的测试，它似乎已在CUDA 9 EA（抢先体验版）中修复，因此我预计CUDA 9正式发布时该问题会得到修复。

一种可能的简单解决方法是修改内核代码，如下所示：

// Workaround variant of the swap kernel: exchanges s[tid] with s[tid + n]
// for every thread whose tid is below n, writing the saved element back
// with memcpy instead of a struct assignment.
//
// s: array of S that the kernel reads and writes in place.
// n: number of element pairs to swap; the caller must supply at least
//    2 * n elements, since the kernel performs no range check on tid + n.
//
// The index is threadIdx.x only (no blockIdx term).
__global__ void
cudaSwap(S *s, int n)
{
    int tid = threadIdx.x;
    S temp;  // per-thread copy of the element being displaced
    if(tid < n)
    {
        temp = s[tid];
        s[tid] = s[tid + n];
        // Changed line: byte-wise copy of temp into s[tid + n], replacing the
        // struct assignment that was reported as miscompiled on CUDA 7.5 / 8.
        memcpy(s+tid+n, &temp, sizeof(S));
    }
}

根据我的测试，这似乎可以解决CUDA 8中此处提出的问题。

带有大结构变量的cuda内核函数会产生错误的结果

1 个答案: