我已经运行过CUDA文档中的示例(example in the CUDA documentation),但是得到了意外的结果。那么应该如何使用wmma函数呢?是我对wmma::load_matrix_sync的用法有误,
还是有其他需要注意的事项?...
WMMA_M,WMMA_N,WMMA_K = 16
// Accumulates the product of one WMMA_M x WMMA_K tile of `a` and one
// WMMA_K x WMMA_N tile of `b` into a float accumulator 10000 times, then
// writes the accumulator tile to `c`.
//
//   a            half tile read as column-major, leading dimension WMMA_M
//   b            half tile read as row-major, leading dimension WMMA_N
//   c            float tile written as row-major, leading dimension WMMA_N
//   matrix_size  not used by the kernel body; kept so existing launches compile
//
// No thread or block index is used, so every warp that runs this kernel loads
// the same tiles and stores the same result to the same `c`.
// The wmma::*_sync calls are warp-wide operations (requires SM70+).
__global__ void wmma_kernel(half *a, half *b, float *c, int matrix_size)
{
    // Integer trip count: the original bound `1e4` was a double literal.
    const int kIterations = 10000;

    //Declare the fragments
    wmma::fragment<wmma::matrix_a, WMMA_M, WMMA_N, WMMA_K, half, wmma::col_major> a_frag;
    wmma::fragment<wmma::matrix_b, WMMA_M, WMMA_N, WMMA_K, half, wmma::row_major> b_frag;
    wmma::fragment<wmma::accumulator, WMMA_M, WMMA_N, WMMA_K, float> acc_frag;

    //Load the matrices into the fragments.
    //The leading dimension is the stride between consecutive columns for a
    //column-major tile and between consecutive rows for a row-major tile:
    //WMMA_M for the M x K column-major A, WMMA_N for the K x N row-major B.
    wmma::load_matrix_sync(a_frag, a, WMMA_M);
    wmma::load_matrix_sync(b_frag, b, WMMA_N);

    //perform mma: acc = a * b + acc, repeated kIterations times
    wmma::fill_fragment(acc_frag, 0.0f);
    for (int i = 0; i < kIterations; i++)
    {
        wmma::mma_sync(acc_frag, a_frag, b_frag, acc_frag);
    }

    //store the result; the M x N row-major output has a row stride of WMMA_N
    wmma::store_matrix_sync(c, acc_frag, WMMA_N, wmma::mem_row_major);
}
... 我将d_a[i]和d_b[i]的每个元素都设置为1.0f,并将c[i]设置为0.0f。执行wmma_kernel函数后,c[i]仍为0.0f,elapsedTime也为0.0f。
matrix_size = 16 x 16
//create the events that bracket the kernel on the default stream
cudaEvent_t start, stop;
CUDA_CHECK_RETURN(cudaEventCreate(&start));
CUDA_CHECK_RETURN(cudaEventCreate(&stop));
//perform the wmma_kernel
CUDA_CHECK_RETURN(cudaEventRecord(start));
wmma_kernel<<<1,256>>>(d_a, d_b, d_c, matrix_size);
//a kernel launch returns no status itself; a failed launch (for example a
//binary built for the wrong architecture) is only visible through
//cudaGetLastError, and would otherwise leave d_c untouched and the time at 0
CUDA_CHECK_RETURN(cudaGetLastError());
CUDA_CHECK_RETURN(cudaEventRecord(stop));
//block the host until the kernel and the stop event have completed
CUDA_CHECK_RETURN(cudaEventSynchronize(stop));
//calculate the elapsed time (cudaEventElapsedTime reports milliseconds)
float elapsedTime = 0.0f;
CUDA_CHECK_RETURN(cudaEventElapsedTime(&elapsedTime, start, stop));
printf("Elapsed Time : %f\n",elapsedTime);
答案 0 :(得分:1)
不能直接将值分配给主机上的half变量。
我建议切换到CUDA 10。它使half
数据类型的使用变得容易得多(considerably easier to use)。
但是,无论使用CUDA 9.2还是CUDA 10,以下示例都应类似地工作:
(示例使用half数据类型。要获取内核时间度量,可以使用基于cudaEvent的计时,详见示例之后的说明。)
$ cat t304.cu
#include <mma.h>
#include <iostream>
using namespace nvcuda;
// Computes c = a * b for a single 16x16x16 tile using the WMMA API.
// `a` is read as column-major half, `b` as row-major half, and `c` is written
// as row-major float; every leading dimension is 16.
// The wmma::*_sync calls are warp-wide operations (requires SM70+).
__global__ void wmma_ker(half *a, half *b, float *c) {
   // Single tile edge length shared by M, N, K and all leading dimensions
   const int kDim = 16;
   typedef wmma::fragment<wmma::matrix_a, kDim, kDim, kDim, half, wmma::col_major> LhsTile;
   typedef wmma::fragment<wmma::matrix_b, kDim, kDim, kDim, half, wmma::row_major> RhsTile;
   typedef wmma::fragment<wmma::accumulator, kDim, kDim, kDim, float> SumTile;

   LhsTile lhs;
   RhsTile rhs;
   SumTile sum;

   // Bring both operand tiles into fragments
   wmma::load_matrix_sync(lhs, a, kDim);
   wmma::load_matrix_sync(rhs, b, kDim);

   // Start from a zero accumulator so the result is exactly lhs * rhs
   wmma::fill_fragment(sum, 0.0f);

   // sum = lhs * rhs + sum
   wmma::mma_sync(sum, lhs, rhs, sum);

   // Write the accumulator tile out in row-major order
   wmma::store_matrix_sync(c, sum, kDim, wmma::mem_row_major);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what) {
  if (status == cudaSuccess) return true;
  std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
  return false;
}

// Fills two 16x16 half matrices with 1.0, multiplies them with wmma_ker on a
// single warp, and prints the 256 float results separated by commas.
// Returns 0 on success and 1 if any CUDA call or the kernel launch fails.
int main(){
  const int kElems = 16*16;  // one 16x16 tile per matrix
  half *d_a = 0, *d_b = 0;
  float *d_c = 0;
  float *h_c = new float[kElems];
  half *h_b = new half[kElems];
  half *h_a = new half[kElems];
  for (int i = 0; i < kElems; i++) {
    h_a[i] = 1.0f;
    h_b[i] = 1.0f;}
  // The && chain stops at the first failing call, so later steps never run
  // on top of an earlier error.
  bool ok = cudaOk(cudaMalloc(&d_a, kElems*sizeof(half)), "cudaMalloc(d_a)")
         && cudaOk(cudaMalloc(&d_b, kElems*sizeof(half)), "cudaMalloc(d_b)")
         && cudaOk(cudaMalloc(&d_c, kElems*sizeof(float)), "cudaMalloc(d_c)")
         && cudaOk(cudaMemcpy(d_a, h_a, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_a)")
         && cudaOk(cudaMemcpy(d_b, h_b, kElems*sizeof(half), cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");
  if (ok) {
    wmma_ker<<<1,32>>>(d_a, d_b, d_c);
    // The launch itself returns no status: cudaGetLastError catches launch
    // failures, and the blocking copy below surfaces errors raised while the
    // kernel was running.
    ok = cudaOk(cudaGetLastError(), "wmma_ker launch")
      && cudaOk(cudaMemcpy(h_c, d_c, kElems*sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(h_c)");
  }
  if (ok) {
    for (int i = 0; i < kElems; i++) std::cout << h_c[i] << ",";
    std::cout << std::endl;
  }
  // Release device and host buffers on every path; cudaFree accepts a null
  // pointer, so buffers that were never allocated are safe to pass.
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  return ok ? 0 : 1;
}
$ nvcc -arch=sm_70 -o t304 t304.cu
$ cuda-memcheck ./t304
========= CUDA-MEMCHECK
16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
========= ERROR SUMMARY: 0 errors
$
要获取内核时间度量,可以使用基于cudaEvent的计时,但是对我来说,仅使用nvprof似乎更容易: