I have two matrices X and Q, of size 4x3 and 2x3 respectively, which in memory look like
x = [0 1 2 3 4 5 6 7 8 9 10 11]
q = [3 4 5 6 7 8]
I tried to multiply them with cublasSgemm, but I could not get the expected result. Since they are stored in row-major order, they should be interpreted (by column-major cuBLAS) as 3x4 and 3x2, so it seemed to me that
cublasSgemm(cublas_handle,
            CUBLAS_OP_T, CUBLAS_OP_N,
            q_rows_num, x_rows_num, dim,
            &alpha, // 1
            q_device, q_rows_num,
            x, x_rows_num,
            &beta, // 0
            x_q_multiplication, q_rows_num);
where
dim = 3
x_rows_num = 4
q_rows_num = 2
would work, but in that case I got the error
** On entry to SGEMM parameter number 8 had an illegal value
I also tried shuffling the parameters around a bit, but I could not find any setting that works. So, is it possible to multiply them without first converting them to column-major order?
EDIT
So I got the expected result with the changes shown in this working example:
#include <cublas_v2.h>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

int main()
{
    int x_rows_num = 4;
    int q_rows_num = 2;
    int dim = 3;
    int N = x_rows_num*dim;
    int M = q_rows_num*dim;

    float *x, *q, *x_q_multiplication;
    cudaMallocManaged(&x, N*sizeof(float));
    cudaMallocManaged(&q, M*sizeof(float));
    cudaMallocManaged(&x_q_multiplication, q_rows_num*x_rows_num*sizeof(float)); // 4*2 floats for the result

    for (int i = 0; i < N; i++) x[i] = i*1.0f;
    for (int i = 0; i < M; i++) q[i] = (i + 3)*1.0f;

    float *q_device;
    cudaMallocManaged(&q_device, M*sizeof(float));
    cudaMemcpy(q_device, q, M*sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    float alpha = 1.f;
    float beta = 0.f;
    cublasSgemm(handle,
                CUBLAS_OP_T, CUBLAS_OP_N,
                x_rows_num, q_rows_num, dim,
                &alpha,
                x, dim,
                q, dim,
                &beta,
                x_q_multiplication, x_rows_num);
    cudaDeviceSynchronize();

    for (int i = 0; i < q_rows_num*x_rows_num; i++) std::cout << x_q_multiplication[i] << " ";

    cudaFree(x);
    cudaFree(q);
    cudaFree(x_q_multiplication);
    return 0;
}
However, I am still not sure why dim ends up being the leading dimension.
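To make that question concrete: as far as I understand, cuBLAS reads a pointer with leading dimension ld as a column-major matrix whose element (i,j) sits at ptr[i + j*ld]. The little host-side snippet below (just my own illustration, not part of the program above) prints what x looks like to cuBLAS when ld = dim = 3, i.e. the 3x4 matrix that CUBLAS_OP_T is then applied to:

#include <iostream>

int main()
{
    const int dim = 3, x_rows_num = 4;
    const float x[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};

    // Column-major view with leading dimension dim: element (i,j) is x[i + j*dim].
    // Prints the 3x4 matrix cuBLAS sees before CUBLAS_OP_T is applied:
    //   0 3 6 9
    //   1 4 7 10
    //   2 5 8 11
    for (int i = 0; i < dim; ++i) {
        for (int j = 0; j < x_rows_num; ++j)
            std::cout << x[i + j*dim] << " ";
        std::cout << std::endl;
    }
    return 0;
}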
Answer (score: 1)
Your original CUBLAS call:
cublasSgemm(cublas_handle,
            CUBLAS_OP_T, CUBLAS_OP_N,
            q_rows_num, x_rows_num, dim,
            &alpha, // 1
            q_device, q_rows_num,
            x, x_rows_num,
            &beta, // 0
            x_q_multiplication, q_rows_num);
was close to being correct. Your interpretation of what the leading dimensions should be was correct. What you got wrong was the Op specifiers (which is also why SGEMM rejected parameter 8: that parameter is lda, and with CUBLAS_OP_T it must be at least k = dim = 3, but q_rows_num = 2 was passed). If both arrays are in row-major order and the first one needs to be read in its (row-major) transposed order, then the operation should be:
#include <cublas_v2.h>
#include <cstring>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>

int main()
{
    int x_rows_num = 4;
    int q_rows_num = 2;
    int dim = 3;
    int N = x_rows_num*dim;
    int M = q_rows_num*dim;

    float x0[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
    float q0[6]  = {3, 4, 5, 6, 7, 8};

    float *x, *q, *x_q_multiplication;
    cudaMallocManaged(&x, N*sizeof(float));
    cudaMallocManaged(&q, M*sizeof(float));
    cudaMallocManaged(&x_q_multiplication, q_rows_num*x_rows_num*sizeof(float)); // 2*4 floats for the result
    std::memcpy(x, x0, N*sizeof(float));
    std::memcpy(q, q0, M*sizeof(float));

    float *q_device;
    cudaMallocManaged(&q_device, M*sizeof(float));
    cudaMemcpy(q_device, q, M*sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    float alpha = 1.f;
    float beta = 0.f;
    cublasSgemm(handle,
                CUBLAS_OP_N, CUBLAS_OP_T,
                q_rows_num, x_rows_num, dim,
                &alpha, // 1
                q_device, q_rows_num,
                x, x_rows_num,
                &beta, // 0
                x_q_multiplication, q_rows_num);
    cudaDeviceSynchronize();

    for (int i = 0; i < q_rows_num*x_rows_num; i++) std::cout << x_q_multiplication[i] << " ";
    std::cout << std::endl;

    cudaFree(x);
    cudaFree(q);
    cudaFree(x_q_multiplication);
    return 0;
}
which does this for me:
$ nvcc -arch=sm_52 cublas_trans.cu -o cublas_trans -lcublas
$ ./cublas_trans
76 88 91 106 106 124 121 142
I believe that is the correct answer.
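For completeness, the same eight numbers fall out of a plain CPU loop that applies cuBLAS's column-major indexing rule (element (i,j) of a matrix with leading dimension ld sits at ptr[i + j*ld]) to your two buffers, using the same dimensions, ops and leading dimensions as the call above. This is only an illustrative sketch of what that call computes, not part of the program:

#include <iostream>

int main()
{
    // Same arguments as the cublasSgemm call above:
    // C (m x n) = op(A) * op(B), with op(A) = A (CUBLAS_OP_N) and op(B) = B^T (CUBLAS_OP_T).
    const int m = 2, n = 4, k = 3;      // q_rows_num, x_rows_num, dim
    const int lda = 2, ldb = 4, ldc = 2;
    const float q[6]  = {3, 4, 5, 6, 7, 8};                      // A
    const float x[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // B

    float C[8];
    for (int j = 0; j < n; ++j)
        for (int i = 0; i < m; ++i) {
            float acc = 0.0f;
            for (int p = 0; p < k; ++p)
                acc += q[i + p*lda] * x[j + p*ldb];  // A(i,p) * B(j,p), i.e. A(i,p) * B^T(p,j)
            C[i + j*ldc] = acc;
        }

    // Prints: 76 88 91 106 106 124 121 142
    for (int i = 0; i < m*n; ++i) std::cout << C[i] << " ";
    std::cout << std::endl;
    return 0;
}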
As an aside, Robert Crovella's now-deleted comment, which you said you took offence at, was 100% correct. I suspect that, like me, he read your original CUBLAS call, interpreted the arguments, and concluded, as I did and as CUBLAS itself did, that you were trying to multiply a 3x4 matrix by a 3x2 matrix. That is what was throwing the invalid argument error.