I have 200 matrices A[i] (each of size 4096 x 48) and 48 vectors v[j] (each of size 48 x 1). I want to compute A[i] * v[j] for i = 0:199 and j = 0:47.
I have been thinking since yesterday about how to arrange the grid size and block size, but I still haven't found an answer. Can anyone give me some advice?
The maximum number of threads per block is 512; that is my working environment.
Below is my code. It runs correctly, I have checked it, but it is slower than Matlab :(
#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_runtime.h>
using std::cout;
using std::endl;
#define kernel_size 48
////////////////////////////////////////////
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice);

    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = d_C.stride = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel: one block per output element, B.height threads per block
    dim3 dimBlock(1, B.height);
    dim3 dimGrid(A.height, C.width);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Each block computes one element of C: blockRow selects the row of A,
    // blockCol selects which 48-column chunk of A (i.e. which A[i] slice)
    // this block works on.
    int blockCol = blockIdx.y;
    int blockRow = blockIdx.x;
    float Cvalue = 0;

    // Thread row within the block (dimBlock is 1 x B.height)
    int row = threadIdx.y;

    // Shared memory for one 48-element row chunk of A and the vector B
    __shared__ float As[1][kernel_size];
    __shared__ float Bs[kernel_size][1];

    // Each thread loads one element of each operand
    As[0][row] = A.elements[blockRow * A.stride + row + B.height * blockCol];
    Bs[row][0] = B.elements[row];

    // Synchronize to make sure the data is loaded
    // before starting the computation
    __syncthreads();

    // Dot product; note that every thread in the block
    // redundantly computes the same value
    for (int e = 0; e < B.height; ++e)
    {
        Cvalue += As[0][e] * Bs[e][0];
    }
    __syncthreads();

    // Each thread writes the (identical) result for this block's C element
    C.elements[blockRow * C.stride + blockCol] = Cvalue;
}
//////////////////
float *gen_matrix(int n /*rows*/, int m /*cols*/)
{
    float *A = (float *) malloc(n * m * sizeof(float));
    //srand(1023);
    for (int row = 0; row < n; row++)
        for (int col = 0; col < m; col++)
            A[row * m + col] = rand() % 10;
    /*
    // print matrix elements
    for (int i = 0; i < n; ++i) {
        for (int j = 0; j < m; ++j)
            cout << " [" << i << "," << j << "] " << A[i*m+j];
        cout << endl;
    }
    */
    return A;
}
int main()
{
    int k = kernel_size;
    int s = 2000;
    int m = 4096;
    //int m = 2;
    //int s = 1;
    int n = k * s;

    float *Ae = gen_matrix(m, n);
    float *Be = gen_matrix(k, 1);
    float *Ce = (float *) malloc(m * s * sizeof(float));

    Matrix A = {n, m, n, Ae};
    Matrix B = {1, k, 1, Be};
    Matrix C = {s, m, s, Ce};

    const clock_t begin_time = clock();
    MatMul(A, B, C);
    std::cout << float(clock() - begin_time) / CLOCKS_PER_SEC << endl;

    for (int i = 0; i < 3; ++i) {
        for (int j = 0; j < 7; ++j)
            cout << " [" << i << "," << j << "] " << Ce[i * s + j];
        cout << endl;
    }

    // Check against a CPU reference
    float *Ce2 = (float *) malloc(s * m * sizeof(float));
    for (int i = 0; i < m; i++)
        for (int j = 0; j < s; j++)
            Ce2[i * s + j] = 0;

    for (int i = 0; i < m; i++) {
        for (int j = 0; j < s; j++) {
            for (int ind = 0; ind < k; ind++) {
                Ce2[i * s + j] += Ae[i * k * s + j * k + ind] * Be[ind];
                //printf("%f---****%f\n", Ae[i*k*s + j*k + ind], Be[ind]);
            }
            if (Ce2[i * s + j] != Ce[i * s + j])
                printf("%f----%f\n", Ce2[i * s + j], Ce[i * s + j]);
        }
    }

    free(Ae);
    free(Be);
    free(Ce);
    free(Ce2);
    return 0;
}
Answer 0 (score: 1)
This is just a matrix-matrix multiplication problem. If you want things to run fast, you should not be writing your own matrix-matrix multiplication code. Use CUBLAS Sgemm.
Conceptually, if you stack the A matrices like this:
[A0]
[A1]
[A2]
...
[A199]
then you have a new matrix AA that is (4096*200) rows x 48 columns.
Arrange the 48 V vectors (each 48x1) in a 48x48 matrix VV:
[V0][V1][V2]...[V47]
(each V vector becomes a column of the new matrix VV).
You now have a single matrix multiplication problem (AA * VV): a (4096*200) x 48 matrix times a 48x48 matrix, giving a (4096*200) x 48 result. Each column of this result is a vector of length 4096*200 that holds 200 of the individual matrix-vector products you are trying to compute; the 200 results per column times 48 columns together give all the results your original problem would produce. The first column contains [V0] multiplied by each of the 200 A matrices, the second column contains [V1] multiplied by each of the 200 A matrices, and so on.
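To make the arrangement concrete, here is one possible host-side packing routine. This is only a sketch: the function name, the assumption that the 200 A matrices sit consecutively in row-major order in a, and the assumption that the 48 vectors sit back to back in v are all mine, not from the post. It writes AA and VV in column-major order so they can be handed to CUBLAS directly.

// Hypothetical packing helper (illustrative names and layouts).
// a:  200 A matrices, each 4096x48, stored consecutively, row-major
// v:  48 vectors of length 48, stored back to back
// AA: (4096*200) x 48, written column-major
// VV: 48 x 48, written column-major (v[j] becomes column j)
void pack_column_major(const float *a, const float *v, float *AA, float *VV)
{
    const int rows = 4096, cols = 48, num_a = 200, num_v = 48;
    const int M = rows * num_a;  // row count of the stacked matrix AA

    // Stack A[i] below A[i-1], transposing row-major into column-major
    for (int i = 0; i < num_a; ++i)
        for (int r = 0; r < rows; ++r)
            for (int c = 0; c < cols; ++c)
                AA[c * M + i * rows + r] = a[i * rows * cols + r * cols + c];

    // Column j of a column-major VV is exactly v[j], so this is
    // just a straight copy of the concatenated vectors
    for (int j = 0; j < num_v; ++j)
        for (int r = 0; r < cols; ++r)
            VV[j * cols + r] = v[j * cols + r];
}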
Once you have arranged the data this way, using CUBLAS Sgemm should be the fastest approach on the GPU. Note that CUBLAS expects the underlying storage to be column-major, so you may want to keep that in mind when rearranging your data. There is CUDA sample code for CUBLAS matrix multiplication.
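A minimal sketch of the Sgemm call itself, assuming d_AA, d_VV, and d_CC are device buffers already filled with column-major data (the buffer and function names are illustrative, not from the post):

#include <cublas_v2.h>

// CC = AA * VV in a single GEMM, all matrices column-major on the device
void stacked_matvec(const float *d_AA, const float *d_VV, float *d_CC)
{
    const int M = 4096 * 200;   // rows of AA (all A[i] stacked)
    const int K = 48;           // cols of AA == rows of VV
    const int N = 48;           // cols of VV (one per vector v[j])
    const float alpha = 1.0f, beta = 0.0f;

    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                M, N, K, &alpha,
                d_AA, M,        // lda = M
                d_VV, K,        // ldb = K
                &beta,
                d_CC, M);       // ldc = M
    cublasDestroy(handle);
}

A single large GEMM like this keeps the GPU far busier than 200*48 separate matrix-vector products, which is the main reason CUBLAS is the right tool here.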
In your code you actually have 2000 A matrices, while your question says 200. I used 200 in this answer, but the concept is the same for 2000 A matrices.