cublasSaxpy computes y' = a * x + y, where x and y are vectors and a is a scalar.
It turns out that I need to compute y' = a * y + x instead. I don't see how to twist the cuBLAS library into doing that.
(Of course, I could compute y' = a * y and then y' = y' + x, but y' is read too often in that case. I could write my own CUDA code to do it, but then it's likely not to be anywhere near as fast as the cuBLAS code. I'm just surprised there's no apparent way to do "saypx" directly.)
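Spelled out, that two-pass workaround would look roughly like this; a minimal sketch assuming the cublas_v2 API, with handle creation and error checking omitted and the helper name saypx_twopass purely illustrative:

#include <cublas_v2.h>

// Two-pass workaround: y := a * y, then y := 1 * x + y.
// Costs an extra full read/write pass over y compared to a fused kernel.
void saypx_twopass(cublasHandle_t handle, int n, float a, const float *x, float *y)
{
    const float one = 1.0f;
    cublasSscal(handle, n, &a, y, 1);         // y := a * y
    cublasSaxpy(handle, n, &one, x, 1, y, 1); // y := 1 * x + y
}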
[Added] Intel's version of cblas has a function like "saxpby" that would do what I need. But strangely enough, it's not in cuBLAS.
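For reference, a minimal sketch of how that extension is used; this assumes MKL's cblas_saxpby, which computes y := a*x + b*y, so passing a = 1 and b = scalar yields the desired y' = scalar * y + x:

#include <mkl.h>

// MKL extension (not in cuBLAS): y := a*x + b*y.
// With a = 1 and b = s this is exactly y' = s * y + x.
void saypx_mkl(int n, float s, const float *x, float *y)
{
    cblas_saxpby(n, 1.0f, x, 1, s, y, 1);
}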
[Added #2] It looks like I could use the cudnnAddTensor function with some aliasing of descriptors (I have a FilterDescriptor pointing at the tensor, which AddTensor won't accept, but I should be able to alias a TensorDescriptor to the same memory and shape).
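A minimal sketch of that aliasing idea, assuming a 4D float tensor and the cudnnAddTensor semantics C = alpha*A + beta*C; the n/c/h/w shape and all names here are illustrative, and error checking is omitted:

#include <cudnn.h>

// Alias a TensorDescriptor onto the filter's memory/shape, then
// cudnnAddTensor computes y := alpha * x + beta * y; with alpha = 1
// and beta = a this is y' = a * y + x.
void saypx_cudnn(cudnnHandle_t handle, float a, const float *x, float *y,
                 int n, int c, int h, int w)
{
    cudnnTensorDescriptor_t desc;
    cudnnCreateTensorDescriptor(&desc);
    cudnnSetTensor4dDescriptor(desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, n, c, h, w);
    const float alpha = 1.0f, beta = a;
    cudnnAddTensor(handle, &alpha, desc, x, &beta, desc, y);
    cudnnDestroyTensorDescriptor(desc);
}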
Answer 0 (score: 2)
There isn't a way I'm aware of to do what you're asking in CUBLAS, nor in standard BLAS. What you found in MKL is an extension added by Intel, and I don't recall seeing anything similar in other host or accelerator BLAS implementations.
The good news is that your assertion "I could write my own CUDA code to do it, but then it's likely not to be anywhere near as fast as the cuBLAS code" is untrue, at least for an operation as trivial as saxpy. Even a naive saxpy implementation will get very close to CUBLAS, because there really aren't many ways to read two arrays, perform an FMAD, and write back the result. As long as you get memory coalescing correct, writing performant code is pretty simple. For example:
#include <vector>
#include <algorithm>
#include <cassert>
#include <iostream>
#include <cmath>
#include "cublas_v2.h"
// Tag type selecting which operation the kernel performs
typedef enum
{
    AXPY = 0,
    AXPBY = 1
} saxpy_op_t;

// y' = a * y + x  (the "saypx" operation asked for)
__device__ __host__ __inline__
float axpby_op(float y, float x, float a)
{
    return a * y + x;
}

// y' = y + a * x  (standard BLAS saxpy)
__device__ __host__ __inline__
float axpy_op(float y, float x, float a)
{
    return y + a * x;
}
// Strided accessor so the kernels honour BLAS-style increments (incx/incy)
template<typename T>
class pitched_accessor
{
    T * p;
    size_t pitch;

public:
    __host__ __device__
    pitched_accessor(T *p_, size_t pitch_) : p(p_), pitch(pitch_) {};

    __host__ __device__
    T& operator[](size_t idx) { return p[pitch*idx]; };

    __host__ __device__
    const T& operator[](size_t idx) const { return p[pitch*idx]; };
};
// Grid-stride loop kernel; the operation is a template parameter, so the
// switch is resolved at compile time and costs nothing at runtime
template<saxpy_op_t op>
__global__
void saxpy_kernel(pitched_accessor<float> y, pitched_accessor<float> x,
                  const float a, const unsigned int N1)
{
    unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int stride = gridDim.x * blockDim.x;

#pragma unroll 8
    for(; idx < N1; idx += stride) {
        switch (op) {
            case AXPY:
                y[idx] = axpy_op(y[idx], x[idx], a);
                break;
            case AXPBY:
                y[idx] = axpby_op(y[idx], x[idx], a);
                break;
        }
    }
}
// Host wrapper for y' = a * y + x, sizing the launch via the occupancy API
__host__ void saxby(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPBY>);
    saxpy_kernel<AXPBY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
                                                 pitched_accessor<float>(x, xinc), a, N);
}

// Host wrapper for the standard y' = y + a * x
__host__ void saxpy(const unsigned int N, const float a,
                    float *x, int xinc, float *y, int yinc)
{
    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, saxpy_kernel<AXPY>);
    saxpy_kernel<AXPY><<<gridsize, blocksize>>>(pitched_accessor<float>(y, yinc),
                                                pitched_accessor<float>(x, xinc), a, N);
}
// Check every element of the result against the expected value
void check_result(std::vector<float> &yhat, float result, float tolerance=1e-5f)
{
    auto it = yhat.begin();
    for(; it != yhat.end(); ++it) {
        float err = std::fabs(*it - result);
        assert( err < tolerance );
    }
}
int main()
{
    const int N = 1<<22;

    std::vector<float> x_h(N);
    std::vector<float> y_h(N);

    const float a = 2.f, y0 = 1234.f, x0 = 532.f;
    std::fill(y_h.begin(), y_h.end(), y0);
    std::fill(x_h.begin(), x_h.end(), x0);

    float *x_d, *y_d;
    size_t sz = sizeof(float) * size_t(N);
    cudaMalloc((void **)&x_d, sz);
    cudaMalloc((void **)&y_d, sz);
    cudaMemcpy(x_d, &x_h[0], sz, cudaMemcpyHostToDevice);

    {
        // y' = a * y + x with the custom kernel
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxby(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpby_op(y0, x0, a));
    }

    {
        // y' = y + a * x with the custom kernel
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        saxpy(N, a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
    }

    {
        // Reference: y' = y + a * x with cuBLAS
        cublasHandle_t handle;
        cublasCreate(&handle);
        cudaMemcpy(y_d, &y_h[0], sz, cudaMemcpyHostToDevice);
        cublasSaxpy(handle, N, &a, x_d, 1, y_d, 1);
        std::vector<float> yhat(N);
        cudaMemcpy(&yhat[0], y_d, sz, cudaMemcpyDeviceToHost);
        check_result(yhat, axpy_op(y0, x0, a));
        cublasDestroy(handle);
    }

    return int(cudaDeviceReset());
}
This demonstrates that a very simple axpy kernel can easily be adapted to perform both the standard operation and the version you want, and runs within 10% of the runtime of CUBLAS on the compute 5.2 device I tested it on:
$ nvcc -std=c++11 -arch=sm_52 -Xptxas="-v" -o saxby saxby.cu -lcublas
ptxas info : 0 bytes gmem
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t0EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
ptxas info : Compiling entry function '_Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj' for 'sm_52'
ptxas info : Function properties for _Z12saxpy_kernelIL10saxpy_op_t1EEv16pitched_accessorIfES2_fj
0 bytes stack frame, 0 bytes spill stores, 0 bytes spill loads
ptxas info : Used 17 registers, 360 bytes cmem[0]
$ nvprof ./saxby
==26806== NVPROF is profiling process 26806, command: ./saxby
==26806== Profiling application: ./saxby
==26806== Profiling result:
Time(%) Time Calls Avg Min Max Name
54.06% 11.190ms 5 2.2381ms 960ns 2.9094ms [CUDA memcpy HtoD]
40.89% 8.4641ms 3 2.8214ms 2.8039ms 2.8310ms [CUDA memcpy DtoH]
1.73% 357.59us 1 357.59us 357.59us 357.59us void saxpy_kernel<saxpy_op_t=1>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.72% 355.15us 1 355.15us 355.15us 355.15us void saxpy_kernel<saxpy_op_t=0>(pitched_accessor<float>, pitched_accessor<float>, float, unsigned int)
1.60% 332.21us 1 332.21us 332.21us 332.21us void axpy_kernel_val<float, int=0>(cublasAxpyParamsVal<float>)