Question

我注意到CUDA中有一个float1结构体类型。与普通的float相比，它是否有任何性能优势？例如，使用float数组与使用float1数组相比如何？

// Single-component vector type: a struct that wraps exactly one float,
// accessed through the member `x`.
// NOTE(review): __device_builtin__ is a macro defined outside this excerpt —
// presumably it only tags the type as a CUDA built-in; confirm in the CUDA headers.
struct __device_builtin__ float1
{
    float x;  // the sole component
};

在float4中，由于对齐方式为4x4bytes = 16bytes，因此具有性能优势。float1是否只是用于带有float1参数的__device__函数这类特殊用途？

提前致谢。

Answer 1

根据@talonmies对帖子“CUDA Thrust reduction with double2 arrays”的评论，我比较了使用CUDA Thrust计算向量范数时分别采用float和float1的情况。我在GT210卡（cc 1.2）上测试了包含N=1000000个元素的数组。两种情况下范数的计算耗时似乎完全相同，即大约3.4s，因此没有性能提升。从下面的代码可以看出，使用float可能比使用float1稍微方便一些。

最后，请注意float4的优势源于对齐属性__builtin_align__，而不是__device_builtin__。

#include <cmath>
#include <cstdio>
#include <cstdlib>

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/transform_reduce.h>

// Unary functor returning the square of a plain float.
// Callable from both host and device code, so it can be handed to Thrust
// algorithms as the transform step of a sum-of-squares reduction.
struct square
{
    // Returns x * x. Declared const so the functor can be invoked through a
    // const object or const reference (it holds no state).
    __host__ __device__ float operator()(float x) const
    {
        return x * x;
    }
};

// Unary functor returning the square of the single component of a float1.
// Callable from both host and device code; the result is a plain float so it
// can be accumulated with an ordinary float reduction.
struct square1
{
    // Returns x.x * x.x. Declared const so the functor can be invoked through
    // a const object or const reference (it holds no state).
    __host__ __device__ float operator()(float1 x) const
    {
        return x.x * x.x;
    }
};

// Terminates the program with a readable message when a CUDA runtime call fails.
// `what` names the failing call so the message points at the right statement.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Benchmarks the Euclidean norm of an N-element vector computed with
// thrust::transform_reduce, once over a float array and once over a float1
// array, and prints the elapsed time (ms) and the result for each.
// Every element is 3, so both norms should ideally be sqrt(9 * N) = 3000.
int main() {

    const int N = 1000000;

    float time = 0.0f;
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    // ---- float version ----
    thrust::device_vector<float> d_vec(N, 3.f);

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    // Wait for the stop event so the elapsed time covers the whole reduction.
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    std::printf("Elapsed time reduction:  %3.1f ms \n", time);

    std::printf("Result of reduction = %f\n", reduction);

    // ---- float1 version ----
    // float1 is filled member-wise on the host, then copied to the device
    // before the timed region starts.
    thrust::host_vector<float1> h_vec1(N);
    for (int i = 0; i < N; i++) h_vec1[i].x = 3.f;
    thrust::device_vector<float1> d_vec1 = h_vec1;

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    float reduction1 = std::sqrt(thrust::transform_reduce(d_vec1.begin(), d_vec1.end(), square1(), 0.0f, thrust::plus<float>()));
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");
    checkCuda(cudaEventElapsedTime(&time, start, stop), "cudaEventElapsedTime");
    std::printf("Elapsed time reduction1:  %3.1f ms \n", time);

    std::printf("Result of reduction1 = %f\n", reduction1);

    // Release the timing events; the original never destroyed them.
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");

    // Keep the console window open until a key is pressed.
    std::getchar();

    return 0;
}

在CUDA中float1 vs float

1 个答案: