我是GPU新手,我想用GPU解决大型矩阵向量乘法问题。我试图使用“cublasDgbmv”解决它,因为矩阵是带状矩阵。我尝试在一个简单的例子上实现命令。这是我写的代码:
/* system of equations sol=A*b:
A=[1 2 3 0 0 0
2 -1 4 1 0 0
3 4 5 -1 7 0
0 1 -1 3 8 9
0 0 7 8 2 6
0 0 0 9 6 10]
b=[0 1 2 3 4 5]
solution supposed to be (using matlab) [8 10 39 85 76 101]*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include <stdlib.h>
#include <stdio.h>
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
int main()
{
cublasHandle_t handle;
int i, j, k=2, m, n=6; // i,j are used as counters, k is the bandwidth, n is the size of the matrix and m is a constant that will be used in the code
double A[36] = {1,2,3,0,0,0, 2,-1,4,1,0,0, 3,4,5,-1,7,0, 0,1,-1,3,8,9, 0,0,7,8,2,6, 0,0,0,9,6,10};
double* Ab;
Ab = (double*)malloc(n*n*sizeof(double));
double* b;
b = (double*)malloc(n*sizeof(double));
double* sol;
sol = (double*)malloc(n*sizeof(double));
const double alph1 = 1;
const double *alpha_1 = &alph1;
const double alph0 = 0;
const double *alpha_0 = &alph0;
double* d_Ab;
cudaMalloc(&d_Ab, n*n*sizeof(double));
double* d_b;
cudaMalloc(&d_b, n*sizeof(double));
double* d_sol;
cudaMalloc(&d_sol, n*sizeof(double));
for(i=0;i<6;i++)
b[i] = i;
for(j=1;j<=n;j++)
{
m=k+1-j;
for(i=MAX(1,j-k);i<=MIN(n,j+k);i++)
Ab[(m+i-1)*n+(j-1)] = A[(i-1)*n + (j-1)];
}
cudaMemcpy(d_Ab, Ab, n*n*sizeof(double), cudaMemcpyHostToDevice);
cudaMemcpy(d_b , b, n*sizeof(double), cudaMemcpyHostToDevice);
cublasCreate(&handle);
cublasDgbmv(handle, CUBLAS_OP_N , n, n, k, k, alpha_1, d_Ab, n, d_b, 1, alpha_0, d_sol, 1);
cublasDestroy(handle);
cudaMemcpy(sol, d_sol, n*sizeof(double), cudaMemcpyDeviceToHost);
for(i=0;i<n;i++)
printf("%f ", sol[i]);
free(Ab);
free(b);
free(sol);
cudaFree(d_Ab);
cudaFree(d_b);
cudaFree(d_sol);
}
我的解决方案是:
4.000000 8.000000 33.000000 -31387192811020962000000000000000000000000000000000000000000000000000.000000 -31387192811020962000000000000000000000000000000000000000000000000000.000000 -31387192811020962000000000000000000000000000000000000000000000000000.000000
我知道矩阵应该是带宽形式和列主要形式。我这样做了link
答案 0 :(得分:0)
我知道答案,我正在为可能想要了解它的人发布。
带状矩阵应该按照问题中提到的那样编写......但这是格式,它应该以列形式保存...因此对于上面提到的例子
A=[1 2 3 0 0 0
2 -1 4 1 0 0
3 4 5 -1 7 0
0 1 -1 3 8 9
0 0 7 8 2 6
0 0 0 9 6 10]
在我之前添加的链接中提到的带状表格
A_banded = [3 1 7 9 0 0
2 4 -1 8 6 0
1 -1 5 3 2 10
0 2 4 -1 8 6
0 0 3 1 7 9]
为了以列主要形式书写,它应该以下列形式保存
A_column_major_form = {3,2,1,0,0,0, 1,4,-1,2,0,0, 7,-1,5,4,3,0, 9,8,3,-1,1,0, 0,6,2,8,7,0, 0,0,10,6,9,0}