Question

我已经运行过 CUDA 文档中的示例（example in the CUDA documentation），但是得到了意外的结果。那么应该如何使用 wmma 函数呢？是我的 wmma::load_matrix_sync 用错了吗？还是还有其他需要注意的地方？...

WMMA_M = WMMA_N = WMMA_K = 16

// Minimal WMMA demo kernel: loads one A tile and one B tile, then issues
// 10000 back-to-back mma_sync operations that keep accumulating A*B into the
// same float accumulator, and finally stores the accumulator to c.
//
// Requirements:
//   - WMMA needs compute capability 7.0 or newer (compile with -arch=sm_70+).
//   - The wmma::*_sync calls are warp-wide operations, so launch with a block
//     size that is a multiple of 32. No thread or block index is used, so
//     every warp in the launch performs the identical loads and the identical
//     store to c; a single warp (<<<1, 32>>>) is sufficient.
//
// Parameters:
//   a           - WMMA_M x WMMA_K half tile, read as column-major.
//   b           - WMMA_K x WMMA_N half tile, read as row-major.
//   c           - WMMA_M x WMMA_N float tile, written as row-major.
//   matrix_size - unused by this kernel; kept for caller compatibility.
__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
  (void)matrix_size;  // intentionally unused

  // Integer trip count (the original compared an int against the double 1e4).
  const int kRepeats = 10000;

  // Declare the fragments
  wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
  wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
  wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;

  // Load the input tiles. The leading dimension is the element stride between
  // consecutive columns (col_major) or consecutive rows (row_major):
  //   A is M x K, column-major -> stride is M
  //   B is K x N, row-major    -> stride is N
  wmma::load_matrix_sync(a_frag, a, WMMA_M);
  wmma::load_matrix_sync(b_frag, b, WMMA_N);

  // Start from a zero accumulator
  wmma::fill_fragment(acc_frag, 0.0f);

  // Repeatedly accumulate: acc = A * B + acc
  for (int i = 0; i < kRepeats; ++i)
    {
      wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }

  // Store the result. C is M x N, row-major -> stride is N
  wmma::store_matrix_sync(c, acc_frag, WMMA_N, wmma::mem_row_major);
}

... 我将 d_a[i] 和 d_b[i] 的每个元素都设置为 1.0f，并将 c[i] 初始化为 0.0f。执行 wmma_kernel 函数后，c[i] 仍为 0.0f，elapsedTime 也为 0.0f。

matrix_size = 16 x 16

      // Create the two events that bracket the kernel launch
  cudaEvent_t start, stop;
  CUDA_CHECK_RETURN(cudaEventCreate(&start));
  CUDA_CHECK_RETURN(cudaEventCreate(&stop));

  // Run the wmma_kernel between the two recorded events
  CUDA_CHECK_RETURN(cudaEventRecord(start));
  wmma_kernel<<<1,256>>>(d_a, d_b, d_c, matrix_size);
  // A kernel launch returns no status itself: query it explicitly so that a
  // failed launch (e.g. a binary built for an architecture the device cannot
  // run) is reported instead of silently leaving d_c untouched and the
  // measured time at zero.
  CUDA_CHECK_RETURN(cudaGetLastError());

  CUDA_CHECK_RETURN(cudaEventRecord(stop));
  // Block until the stop event has completed; errors raised while the kernel
  // was executing surface here.
  CUDA_CHECK_RETURN(cudaEventSynchronize(stop));

  // Calculate the elapsed time between the two events
  float elapsedTime = 0.0f;
  CUDA_CHECK_RETURN(cudaEventElapsedTime(&elapsedTime, start, stop));

  printf("Elapsed Time : %f\n",elapsedTime);

Answer 1

在主机端不能直接给 half 变量赋值。

我建议切换到 CUDA 10。它让 half 数据类型的使用变得容易得多（considerably easier to use）。

但是，无论使用CUDA 9.2还是CUDA 10，以下示例都应类似地工作：

half

要获取内核时间度量，可以使用基于$ cat t304.cu #include <mma.h> #include <iostream> using namespace nvcuda; __global__ void wmma_ker(half *a, half *b, float *c) { // Declare the fragments wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::col_major> a_frag; wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; // Initialize the output to zero wmma::fill_fragment(c_frag, 0.0f); // Load the inputs wmma::load_matrix_sync(a_frag, a, 16); wmma::load_matrix_sync(b_frag, b, 16); // Perform the matrix multiplication wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); // Store the output wmma::store_matrix_sync(c, c_frag, 16, wmma::mem_row_major); } int main(){ half *d_a, *h_a, *d_b, *h_b; float *d_c, *h_c; h_c = new float[16*16]; h_b = new half[16*16]; h_a = new half[16*16]; cudaMalloc(&d_a, 16*16*sizeof(half)); cudaMalloc(&d_b, 16*16*sizeof(half)); cudaMalloc(&d_c, 16*16*sizeof(float)); for (int i = 0; i < 16*16; i++) { h_a[i] = 1.0f; h_b[i] = 1.0f;} cudaMemcpy(d_a, h_a, 16*16*sizeof(half), cudaMemcpyHostToDevice); cudaMemcpy(d_b, h_b, 16*16*sizeof(half), cudaMemcpyHostToDevice); wmma_ker<<<1,32>>>(d_a, d_b, d_c); cudaMemcpy(h_c, d_c, 16*16*sizeof(float), cudaMemcpyDeviceToHost); for (int i = 0; i < 16*16; i++) std::cout << h_c[i] << ","; std::cout << std::endl; } $ nvcc -arch=sm_70 -o t304 t304.cu $ cuda-memcheck ./t304 ========= CUDA-MEMCHECK 
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16, ========= ERROR SUMMARY: 0 errors $的计时，但是对我来说，仅使用cudaEvent似乎更容易：

nvprof

如何使用WMMA功能？

1 个答案: