The following kernel multiplies two n×n matrices:
__global__ void matrixMultiplication(const double *A, const double *B, double *C, int N)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;
    double value = 0;
    for(int k = 0; k < N; k++){
        value += A[k * N + j] * B[i * N + k];
    }
    C[i * N + j] = value;
}
I am using the above kernel from MATLAB as follows:
k = parallel.gpu.CUDAKernel('matrixMultiplication.ptx', 'matrixMultiplication.cu');
A = rand(3,4);
b = rand(4,1);
C = zeros(3,1);
k.ThreadBlockSize = [3,4,1];
k.GridSize = [1, 1];
D = A*b;
C = feval(k,A,b,C,4);
D-C
But the result is not zero! How do I need to change this kernel so that it can multiply an m×n matrix by an n×1 vector?
Answer 0 (score: 1)
Since you are calling this from MATLAB (where arrays are stored in column-major order), you only need to make a very small modification to the kernel code:
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <iostream>

// C = A * B with column-major storage (as MATLAB uses):
// A is M x N, B is N x K, C is M x K.
// Each thread computes one entry of C: thread (j, i) -> C(j, i).
__global__ void matrixMultiplication(const double *A, const double *B, double *C, int M, int N)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;   // column of C
    int j = blockDim.x * blockIdx.x + threadIdx.x;   // row of C
    double value = 0;
    for(int k = 0; k < N; k++){
        value += A[k * M + j] * B[i * N + k];        // A(j,k) * B(k,i), column-major
    }
    C[i * M + j] = value;                            // C(j,i), column-major
}

int main()
{
    const int M = 3, N = 4, K = 1;
    thrust::device_vector<double> A(M*N), B(N*K), C(M*K);

    // Fill A with 1..12 and B with 1..4, in column-major order
    thrust::counting_iterator<double> counter(1.0);
    thrust::copy(counter, counter + (M*N), A.begin());
    thrust::copy(counter, counter + (N*K), B.begin());

    // One thread per entry of the M x K result
    dim3 grid(1,1), block(M,K);
    matrixMultiplication<<<grid, block>>>( thrust::raw_pointer_cast(A.data()),
                                           thrust::raw_pointer_cast(B.data()),
                                           thrust::raw_pointer_cast(C.data()),
                                           M, N );
    cudaDeviceSynchronize();

    for(int i=0; i<M*K; i++)
        std::cout << C[i] << std::endl;

    return 0;
}
The kernel needs one thread per entry of the output matrix, so for your (3 x 4) * (4 x 1) example you need to launch 3 x 1 threads. When you build and run it, you should see the following:
$ nvcc -arch=sm_52 -std=c++11 -o spoonfull spoonfull.cu
$ ./spoonfull
70
80
90
which is the correct result for column-major storage.
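For reference, here is a minimal, untested sketch of how the modified kernel could then be called from your MATLAB code; the PTX/CU file names are the ones from your question, and passing M and N explicitly assumes the kernel has been recompiled with the new signature:
% Sketch only: assumes matrixMultiplication.cu above was recompiled to PTX
k = parallel.gpu.CUDAKernel('matrixMultiplication.ptx', 'matrixMultiplication.cu');
M = 3; N = 4;
A = rand(M, N);
b = rand(N, 1);
C = zeros(M, 1);
k.ThreadBlockSize = [M, 1, 1];   % one thread per entry of the M x 1 result
k.GridSize = [1, 1];
C = feval(k, A, b, C, M, N);     % pass both dimensions to match the new signature
D = A * b;
disp(D - C)                      % should now be zero up to floating-point rounding
Compared with the original call, the thread block covers only the 3 x 1 output rather than 3 x 4, and the kernel receives both matrix dimensions instead of a single size.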