Question

我正在处理的部分代码需要尽可能快地执行矩阵-向量乘法，也就是要使用像cublas这样经过优化的第三方库（同样的原则也适用于任何CPU BLAS）。

问题在于向量中的元素之间存在一种跨度，如下所示：

矩阵存储为3Nx3N 1D浮点阵列。

向量存储为含N个float4的一维数组，但每个float4只使用前三个元素，第四个应该被忽略。

N大约为数百万。

如果向量存储为float3而不是float4，我可以将指针强制转换为float*，就像在这个可运行的示例中一样：

//Compile with nvcc test.cu -O3 -lcublas -o test

/*
Multiply a 3Nx3N float matrix, M,  by a vector, X, of N float3 elements 

The result, Y, is a 3N float vector
-----------------------

What if X is a vector of N float4?

How can I tell cublas to skip the fourth element?

*/

#include<iostream>
#include<thrust/device_vector.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>

using namespace std;

// Demo: multiplies a (3N)x(3N) float matrix M by a vector X of N float3 elements
// with cublasSgemv and prints X and the resulting 3N-float vector Y.
// Returns 0 on success, 1 if a cuBLAS call reports a failure.
int main(){

  // X holds N float3 elements, so M is (3N)x(3N) and Y has 3N entries.
  const int N = 3;

  // Filled at construction rather than with N single-element host-to-device writes.
  // In the float4 case this would be make_float4(1,1,1,0); the result should be the same.
  thrust::device_vector<float3> X(N, make_float3(1,1,1));

  thrust::device_vector<float> Y(3*N);

  thrust::device_vector<float> M(3*N*3*N, 1);


  cublasHandle_t handle;

  if(cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS){
    cerr<<"cublasCreate failed"<<endl;
    return 1;
  }

  float beta = 0.0f;
  float alpha = 1.0f;
  // X is handed to cuBLAS as 3N consecutive floats with increment 1; this is the
  // float3 -> float pointer cast the example is about.
  cublasStatus_t status = cublasSgemv(handle, CUBLAS_OP_T,
          3*N, 3*N,
          &alpha,
          thrust::raw_pointer_cast(&M[0]), 3*N,
          (float*) thrust::raw_pointer_cast(&X[0]), 1,
          &beta,
          thrust::raw_pointer_cast(&Y[0]), 1);
  if(status != CUBLAS_STATUS_SUCCESS){
    cerr<<"cublasSgemv failed with status "<<status<<endl;
    cublasDestroy(handle);
    return 1;
  }

  cout<<"Performed Y = M·X\n\tX = ";
  for(int i=0; i<N; i++){
    float3 Xi = X[i];
    cout<<Xi.x<<" "<<Xi.y<<" "<<Xi.z<<" ";
  }
  cout<<"\n\tY = ";
  for(int i=0; i<3*N; i++){
    cout<<Y[i]<<" ";
  }
  cout<<endl;

  // Release the handle; the original never destroyed it.
  cublasDestroy(handle);

  return 0;
}

但是，如果X向量存储为float4，该如何执行此操作？

鉴于float4*可以被解释为元素数量为其4倍的float*，这个问题可以更一般化（尽管我只关心float4的情况）：即每3个“有用”元素之间存在一个步幅。我想告诉cublas，数组在内存中不是连续存放的，而是类似这样：开头有3个元素，接下来的3个元素位于“stride”个元素之后，依此类推。这类似于在OpenGL中使用顶点数组对象所能做到的事情。

修改

答案表明，最可行的方法是将带步幅的数组复制到一个cublas能够理解的临时float3数组中。

目前这三个选项是：

1. Use cudaMemcpy2D
2. Use a thrust transformation
3. Use a custom copy kernel

我编写了这段代码来测试这三种情况：

//Compile with: nvcc test.cu -O3 -lcublas -o test
#include<cstdio>
#include<iostream>
#include<cuda.h>
#include<cuda_runtime.h>
#include<cublas_v2.h>
#include<thrust/device_vector.h>
#include<thrust/transform.h>

using namespace std;


// Stopwatch built on CUDA events recorded on stream 0.
// Usage: tic(); ...GPU work...; float ms = toc();
struct Timer{
  cudaEvent_t start, stop;
  float time;
  bool armed; // true between tic() and toc(), i.e. while start/stop hold live events

  Timer(): start(0), stop(0), time(0.0f), armed(false){}

  // Starts (or restarts) a measurement.
  void tic(){
    if(armed){
      // tic() without a matching toc(): drop the old events instead of leaking them
      cudaEventDestroy(start);
      cudaEventDestroy(stop);
      armed = false;
    }
    if(cudaEventCreate(&start) != cudaSuccess) return;
    if(cudaEventCreate(&stop) != cudaSuccess){
      cudaEventDestroy(start);
      return;
    }
    armed = true;
    cudaEventRecord(start, 0);
  }

  // Stops the measurement and returns the elapsed time in milliseconds.
  // Returns 0 when no measurement is running, or when the elapsed-time query fails.
  float toc(){
    if(!armed) return 0.0f;

    time = 0.0f; // defined result even if cudaEventElapsedTime fails
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop); // wait until the work queued before stop has finished
    cudaEventElapsedTime(&time, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    armed = false;

    return time;
  }

};



// Unary functor that converts a float4 to a float3 by dropping the w component.
// operator() is const and callable from both host and device, so the functor works
// through const references and with host-side algorithms as well as device ones.
struct copy_functor{
  copy_functor(){}
  __host__ __device__ float3 operator() (const float4& X4) const{
    return make_float3(X4.x, X4.y, X4.z);
  }
};


// Copies the x, y, z components of each of the N float4 elements in X4 into the
// matching float3 element of X3; the w component is dropped.
// Launch layout: 1D grid of 1D blocks, no shared memory. Any grid size is valid
// because each thread walks the array with a stride of the total thread count.
// N <= 0 is a no-op.
__global__ void copy_kernel(const float4* __restrict__ X4, float3* __restrict__ X3, int N){
  const size_t count  = N > 0 ? (size_t)N : 0;
  // 64-bit index arithmetic so blockIdx.x*blockDim.x cannot wrap
  const size_t stride = (size_t)gridDim.x*blockDim.x;
  for(size_t id = (size_t)blockIdx.x*blockDim.x + threadIdx.x; id < count; id += stride){
    float4 x4 = X4[id];
    X3[id] = make_float3(x4.x, x4.y, x4.z);
  }
}

// Reports a pending CUDA runtime error for the named benchmark stage.
// Returns true when no error is pending.
static bool stage_ok(const char* stage){
  cudaError_t err = cudaGetLastError();
  if(err == cudaSuccess) return true;
  fprintf(stderr, "CUDA error during %s: %s\n", stage, cudaGetErrorString(err));
  return false;
}

// Benchmark: times three ways of repacking N float4 elements into N float3 elements
// on the device (cudaMemcpy2DAsync, thrust::transform, custom kernel) and prints the
// average time per repack over Ntest repetitions.
// Returns 0 on success, 1 if a stage left a CUDA error behind.
int main(){

  const int N = 1000000;
  const int Ntest = 1000;

  // Launch configuration for copy_kernel: ceil-division so no surplus block is launched
  const int threads = 128;
  const int blocks = (N + threads - 1)/threads;

  Timer t;

  thrust::device_vector<float3> X3(N, make_float3(0,0,0));
  thrust::device_vector<float4> X4(N, make_float4(1,1,1,10));

  // Raw device pointers are loop-invariant, so fetch them once
  float3* d_X3 = thrust::raw_pointer_cast(&X3[0]);
  const float4* d_X4 = thrust::raw_pointer_cast(&X4[0]);

  float ms = 0.0f;

  /*************************CUDAMEMCPY2D*******************/
  t.tic();

  for(int i= 0; i<Ntest; i++){
    // Treat X4 as N rows of 4 floats and copy the first 3 floats of every row
    cudaMemcpy2DAsync(d_X3,
              3*sizeof(float),
              d_X4,
              4*sizeof(float),
              3*sizeof(float),
              N,
              cudaMemcpyDeviceToDevice);
    // Per-iteration sync kept from the original benchmark so the timings stay comparable
    cudaDeviceSynchronize();
  }
  ms = t.toc()/(float)Ntest;
  if(!stage_ok("cudaMemcpy2DAsync")) return 1;
  printf ("Time for cudaMemcpy2DAsync: %f ms\n", ms);


  /************************THRUST***********************/
  t.tic();

  for(int i= 0; i<Ntest; i++){
    // Qualified explicitly: the unqualified call depended on argument-dependent lookup
    thrust::transform(X4.begin(), X4.end(), X3.begin(), copy_functor());
    cudaDeviceSynchronize();
  }
  ms = t.toc()/(float)Ntest;
  if(!stage_ok("thrust transformation")) return 1;
  printf ("Time for thrust transformation: %f ms\n", ms);

  /*********************COPY KERNEL*****************************/

  t.tic();
  for(int i= 0; i<Ntest; i++){
    copy_kernel<<< blocks, threads >>>(d_X4, d_X3, N);
    cudaDeviceSynchronize();
  }
  ms = t.toc()/(float)Ntest;
  if(!stage_ok("copy kernel")) return 1;
  printf ("Time for copy kernel: %f ms\n", ms);


  return 0;
}

请注意，我取的是1000次运行的平均值。

GTX 980中此代码的输出如下：

Time for cudaMemcpy2DAsync: 1.465522 ms
Time for thrust transformation: 0.178745 ms
Time for copy kernel: 0.168507 ms

cudaMemcpy2D比其他两种方法慢一个数量级。

thrust和复制内核的耗时非常接近，是最快的方式。

无论元素数量是多少，这一结论似乎都成立。

EDIT2：

另一个答案指出可以用GEMM来表达步幅，无需临时数组。

把矩阵-向量乘法当作矩阵-矩阵乘法来处理，可以这样做：

 // Fragment: the matrix-vector product rewritten as a GEMM call, with M as the
 // first matrix operand, the packed float3 buffer X3 as the second, and Y as the output.
 // NOTE(review): the argument tagged /*m*/ sits in the position the cuBLAS
 // signature calls n - confirm against the cublasSgemm documentation.
 cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_T,
              3*N, 1 /*m*/, 3*N,
              &alpha,
              thrust::raw_pointer_cast(&M[0]), 3*N,
              (float*) thrust::raw_pointer_cast(&X3[0]), 1 /*ldb*/,
              &beta,
              thrust::raw_pointer_cast(&Y[0]), 3*N);

然而，此时，我不知道如何传递X4而不是X3。解决方案似乎在m和ldb参数中。

Answer 1

您可以将1-D float4矢量视为行间距为4的Nx3 2-D浮点矩阵，并使用cudaMemcpy2DAsync将步幅从4更改为3

// Fragment: view src as N rows of 4 floats and copy only the first 3 floats of
// each row into dst, so dst ends up as 3N tightly packed floats.
cudaMemcpy2DAsync(dst,
                  3*sizeof(float),   // destination pitch: 3 floats per row, no gap
                  src,
                  4*sizeof(float),   // source pitch: one float4 per row
                  3*sizeof(float),   // bytes copied from each row
                  N,                 // number of rows
                  cudaMemcpyDeviceToDevice);

然后可以将dst视为3N 1-D浮点向量并直接传递给gemv()。

鉴于N的规模，与gemv()相比，复制的耗时可以忽略不计。

修改

来自@Apo的基准测试结果表明，最好使用复制内核而不是cudaMemcpy2DAsync。我对cudaMemcpy2DAsync期望过高，以为它已被充分优化，在所有情况下都会有最佳表现。

使用cublas sgemv时如何跳过float4中的第四个元素？

1 个答案: