我正在尝试Cuda编程。作为其中的一部分,我试图开发一种在GPU上运行的矩阵乘法算法。该算法适用于方形矩阵,但对于非方形矩阵则失败。 这是我的内核
float* multiply_gpu(float* matrix1 , float* matrix2);
// Kernel: computes C = A * B for dense row-major matrices, one thread per
// output element. Expects at least rowsA*columnsB total threads.
// NOTE(review): this is the buggy version under discussion -- see the two
// flagged lines.
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a,
float *b, float *result) {
// Flat global thread index doubles as the linear index into the result.
int index = blockIdx.x * blockDim.x + threadIdx.x;
int result_size = rowsA*columnsB;
// NOTE(review): accumulating float products into an int truncates every
// partial sum; only coincidentally correct for integer-valued test data.
int value = 0;//the final result
//indices of values from input matrices
if (index < result_size) {
// BUG: the base of the current row of A should be
// (index/columnsB)*columnsA. Dividing by rowsA and multiplying by rowsA
// is only equivalent when the matrices are square.
int index1 = (index/rowsA)*rowsA; //get nearest row
int index2 = index%columnsB; //get start column
int k = 0;
while (k<columnsA) { //columnsA == rowsB
value += a[index1]*b[index2]; //v = sum a_ik * b_kj
index1 ++;
index2 += columnsB;
k++;
}
result[index] = value;
}
}
在与我的主管进行简短的健全检查后,他没有看到任何问题。 我认为问题在于这个功能:
// Host wrapper: multiplies two row-major matrices on the GPU and returns a
// newly malloc'd result array (caller frees). Inputs carry their dimensions
// inline: [0] = height (rows), [1] = width (cols); data starts at index 2.
// Returns NULL when the inner dimensions do not match.
float* multiply_gpu(float* matrix1 , float* matrix2) {
//the dimensions of the matrices
size_t available, total;
cudaError_t error;
// Query free/total device memory; values are only used for the error check.
cudaError err = cudaMemGetInfo(&available, &total);
if(err != cudaSuccess){
printf("There was an error: %s\n", cudaGetErrorString(err));
}
// Dimensions are stored as floats inside the data arrays; these implicit
// float->int conversions are what produce the compiler warnings.
int height1 = matrix1[0];
int width1 = matrix1[1];
int height2 = matrix2[0];
int width2 = matrix2[1];
// (h1 x w1) * (h2 x w2) requires w1 == h2.
if (width1!=height2) {
return NULL;
}
//this array contains the result of the operation
// NOTE(review): malloc return value is not checked for NULL.
float* result = (float *) malloc(height1*width2*sizeof(float));
//pointers for device matrices
float *d_matrix1;
float *d_matrix2;
float *d_result;
//allocate memory for matrices
error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
if (error != cudaSuccess) {
fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//now copy matrices onto device -- note the offset of 2
error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//launch multiplication kernel
//note I have tried adjusting the kernel values between <<< , >>> to no avail
// Grid: height1 blocks of width2 threads = one thread per output element.
// NOTE(review): no cudaGetLastError() after the launch, so a bad launch
// configuration (e.g. width2 > max threads per block) would go unnoticed.
mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
printf("%d %d %d %d\n",height1,width1,height2,width2);
//make the host block until mult is finished running
//printf("finished multiplying\n");
// NOTE(review): cudaDeviceSynchronize()'s return value is discarded, so a
// kernel-execution fault only surfaces at the next checked API call.
cudaDeviceSynchronize();
//copy result back
error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
if (error != cudaSuccess) {
fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
exit(EXIT_FAILURE);
}
//free now unneeded cuda memory
cudaFree(d_matrix1);
cudaFree(d_matrix2);
cudaFree(d_result);
printf("GOT RESULT\n");
for (int i=0;i<height1*width2;i++) {
printf("%f ",result[i]);
}
printf("\n");
//result ready to be returned
return result;
}
请注意,作为multiply_gpu参数的矩阵的高度为索引0,宽度为索引1.结果矩阵没有此信息。
错误计算的一个例子: 当我将以下数组提供给multiply_gpu - {2,3, 1,2,3,4,5,6 },{3,2, 1,2,3,4 ,5,6 }答案应该是{22,28,49,64},而是我的单元测试生成{22,28,40,52}。很近!请注意,对于(1,2,3)*(1,2,3)(不是正方形)的点积,算法很高兴......这里的错误是什么?谢谢你的帮助。如果我独立找到一个解决方案,我会发布解决方案。
答案 0 :(得分:1)
这一行错了:
int index1 = (index/rowsA)*rowsA; //get nearest row
它应该是这样的:
int index1 = (index/columnsB)*columnsA; //get nearest row
为什么这个公式是正确的?index1 用于在 A 中索引与当前计算的输出矩阵位置对应的那一行的元素,而输出矩阵位置就是线程索引。如果我们把线程索引(整数)除以输出矩阵 C 的列数,就得到所求的行号;然后,要定位 A 中该行的第一个元素,再乘以 A 的列数。这样就正确地索引到了 A 中相关行的第一个元素。
这是一个完整的应用程序以及我的测试用例 - 我对您的代码所做的唯一更改是上面指出的更改。
$ cat t290.cu
#include <stdio.h>
// Kernel: computes C = A * B for dense row-major matrices, one thread per
// output element. Launch with at least rowsA*columnsB total threads; the
// bounds check makes extra threads harmless.
// Precondition: columnsA == rowsB.
__global__ void mult(int rowsA , int columnsA, int rowsB,int columnsB, float *a, float *b, float *result) {
    // Flat global index doubles as the linear (row-major) index into C.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int result_size = rowsA*columnsB;
    // Accumulate in float: the original 'int value' truncated every partial
    // sum and only happened to work for integer-valued test data.
    float value = 0.0f; //the final result
    //indices of values from input matrices
    if (index < result_size) {
        // Row of C is index/columnsB; that row of A begins at
        // row * columnsA, since A has columnsA entries per row.
        int index1 = (index/columnsB)*columnsA; //get nearest row
        int index2 = index%columnsB; //get start column
        int k = 0;
        while (k<columnsA) { //columnsA == rowsB
            value += a[index1]*b[index2]; //v = sum a_ik * b_kj
            index1 ++;
            index2 += columnsB; // step down column j of B, one row at a time
            k++;
        }
        result[index] = value;
    }
}
// Multiplies two row-major matrices on the GPU and returns a freshly
// malloc'd result array (caller owns and must free it).
//
// Input format: matrixN[0] = height (rows), matrixN[1] = width (cols);
// the matrix data starts at matrixN[2]. The result carries no such header.
// Returns NULL if the inner dimensions disagree (width1 != height2);
// exits the process on any CUDA or host allocation failure.
float* multiply_gpu(float* matrix1 , float* matrix2) {
    //the dimensions of the matrices
    size_t available, total;
    cudaError_t error;
    cudaError err = cudaMemGetInfo(&available, &total);
    if(err != cudaSuccess){
        printf("There was an error: %s\n", cudaGetErrorString(err));
    }
    // Dimensions travel as floats inside the data arrays; the implicit
    // float->int conversions are intentional (and warned about by nvcc).
    int height1 = matrix1[0];
    int width1 = matrix1[1];
    int height2 = matrix2[0];
    int width2 = matrix2[1];
    if (width1!=height2) {
        printf("fail!\n");
        return NULL;
    }
    //this array contains the result of the operation
    float* result = (float *) malloc(height1*width2*sizeof(float));
    if (result == NULL) {
        fprintf(stderr, "Failed to allocate host memory!\n");
        exit(EXIT_FAILURE);
    }
    //pointers for device matrices
    float *d_matrix1;
    float *d_matrix2;
    float *d_result;
    //allocate memory for matrices
    error = cudaMalloc((void **)&d_matrix1,(size_t)height1*width1*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_matrix2,height2*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMalloc((void **)&d_result,height1*width2*sizeof(float));
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to allocate memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //now copy matrices onto device -- note the offset of 2 (skips the header)
    error = cudaMemcpy(d_matrix1 , matrix1+2 , height1*width1*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    error = cudaMemcpy(d_matrix2 , matrix2+2 , height2*width2*sizeof(float), cudaMemcpyHostToDevice);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //launch multiplication kernel: height1 blocks of width2 threads each,
    //i.e. one thread per output element
    mult<<<height1,width2>>>(height1,width1,height2,width2,d_matrix1,d_matrix2,d_result);
    printf("%d %d %d %d\n",height1,width1,height2,width2);
    // The launch itself is asynchronous; this catches configuration errors.
    error = cudaGetLastError();
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to launch kernel (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //make the host block until mult is finished running
    //printf("finished multiplying\n");
    error = cudaDeviceSynchronize();
    if (error != cudaSuccess) {
        fprintf(stderr, "kernel fail (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //copy result back
    error = cudaMemcpy(result,d_result,height1*width2*sizeof(float),cudaMemcpyDeviceToHost);
    if (error != cudaSuccess) {
        fprintf(stderr, "Failed to copy memory (error code %s)!\n", cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
    //free now unneeded cuda memory
    cudaFree(d_matrix1);
    cudaFree(d_matrix2);
    cudaFree(d_result);
    printf("GOT RESULT\n");
    for (int i=0;i<height1*width2;i++) {
        printf("%f ",result[i]);
    }
    printf("\n");
    //result ready to be returned
    return result;
}
// Exercises multiply_gpu() on several shapes, including the question's
// failing non-square case (2x3)*(3x2) and its transpose.
// Results are malloc'd by multiply_gpu, so each one is freed here
// (free(NULL) is a no-op, covering the dimension-mismatch path).
int main(){
    // First two entries of every array are {height, width}; data follows.
    float m1[8] = {2.0, 3.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
    float m2[6] = {2.0, 2.0, 1.0, 1.0, 2.0, 2.0};
    float *my_result1 = multiply_gpu(m2, m1);   // (2x2)*(2x3) -> 2x3
    free(my_result1);
    float m3[8] = {2,3,1,2,3,4,5,6};
    float m4[8] = {3,2,1,2,3,4,5,6};
    float *my_result2 = multiply_gpu(m3, m4);   // (2x3)*(3x2) -> 2x2
    free(my_result2);
    float *my_result3 = multiply_gpu(m4, m3);   // (3x2)*(2x3) -> 3x3
    free(my_result3);
    float m5[12] = {2,5,1,1,1,1,1,1,1,1,1,1};
    float m6[22] = {5,4,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1};
    float *my_result4 = multiply_gpu(m5, m6);   // (2x5)*(5x4) -> 2x4
    free(my_result4);
    return 0;
}
$ nvcc -arch=sm_20 -o t290 t290.cu
t290.cu: In function 'float* multiply_gpu(float*, float*)':
t290.cu:30: warning: converting to 'int' from 'float'
t290.cu:31: warning: converting to 'int' from 'float'
t290.cu:32: warning: converting to 'int' from 'float'
t290.cu:33: warning: converting to 'int' from 'float'
$ cuda-memcheck ./t290
========= CUDA-MEMCHECK
2 2 2 3
GOT RESULT
5.000000 7.000000 9.000000 10.000000 14.000000 18.000000
2 3 3 2
GOT RESULT
22.000000 28.000000 49.000000 64.000000
3 2 2 3
GOT RESULT
9.000000 12.000000 15.000000 19.000000 26.000000 33.000000 29.000000 40.000000 51.000000
2 5 5 4
GOT RESULT
5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000
========= ERROR SUMMARY: 0 errors
$
答案 1 :(得分:0)
因此,在仔细检查我的矩阵代码后,我发现了所用数学公式中的一个简单错误。
这条线路确实是错误的
int index1 = (index/rowsA)*rowsA; //get nearest row
我注意到由于我的矩阵是按行排序的,因此从(i,j)元素获取正确索引的公式是
index = i*rowLength + j
因此,对index1的赋值应为
int index1 = (index/columnsB)*columnsA; //行号 = index/columnsB(输出矩阵每行有 columnsB 个元素)
为什么呢?很明显,要导航到行 n 的索引,我们必须移动 n 行长度(这是矩阵中的列数)。我的代码适用于方形矩阵,但不适用于其他矩形矩阵,因为列数与这种矩阵中的行数不匹配。