我遇到了一个小问题。我必须解决一个线性系统A * x = b
。
矩阵A
通过LU分解(LAPACK
)分解。结果我得到了分解后的矩阵和主元数组(pivot array)。之后,我想用 *cublasDtrsm* 在GPU上求解两个线性系统:L * y = b 和 U * x = y。但由于LAPACK的dgetrf在分解过程中进行了行交换,我必须把主元数组传递给cublas,而 *cublasDtrsm* 函数并没有为此提供任何参数。不使用主元数组,我得到的结果是错误的。
我已经查找过如何在LAPACK中禁用选主元(pivoting),但出于数值稳定性的考虑,这是不可行的。有没有人能提示一下,如何利用这个LU分解的结果来求解线性方程组?
答案 0 :(得分:0)
如果你想使用这种特定的方法(在LAPACK getrf之后调用cublas trsm),我相信你可以这样做:先按照LAPACK在选主元过程中执行的行交换顺序重新排列你的b向量(或矩阵),然后就可以把LAPACK输出的L、U直接用于cublas trsm。我相信这个交换顺序由LAPACK文档中的形式参数ipiv给出:
IPIV
IPIV是INTEGER数组,维度为(min(M,N))。主元索引:对于 1 <= i <= min(M,N),矩阵的第i行与第IPIV(i)行互换。
这是一个示例代码,演示了使用单个RHS向量的简单3x3测试用例的想法:
$ cat t853.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Evaluates a CUDA runtime call exactly once and checks the cudaError_t it
// returns. If the result is anything other than cudaSuccess, the macro prints
// the source file, line number and cudaGetErrorString() text to stderr, calls
// cudaDeviceReset(), and terminates the process with EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// the macro is used with a trailing semicolon like an ordinary call.
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
// Evaluates a cuBLAS call exactly once and checks the cublasStatus_t it
// returns. If the result is anything other than CUBLAS_STATUS_SUCCESS, the
// macro prints the source file, line number and the numeric status code to
// stderr, calls cudaDeviceReset(), and terminates the process with
// EXIT_FAILURE.
// The do { ... } while (0) wrapper makes the expansion a single statement, so
// the macro is used with a trailing semicolon like an ordinary call.
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
// LU-factorizes a single n x n matrix that already resides in device memory,
// using cublasSgetrfBatched with a batch size of one.
//
//   src_d  device pointer to the matrix (leading dimension n); the
//          factorization is performed in place on this buffer
//   n      order of the matrix
//   pivot  host array with room for n ints; receives the pivot indices
//          reported by cuBLAS
//
// Any CUDA or cuBLAS error, and any non-zero factorization status, terminates
// the process with EXIT_FAILURE.
void LU_device(float *src_d, int n, int *pivot)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    const int batchSize = 1;
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    const int lda = n;

    // The batched API expects a device-resident array of matrix pointers, so
    // the one-element pointer table is copied to the device first.
    float *A[] = { src_d };
    float **A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(pivot, P, n * batchSize * sizeof(int), cudaMemcpyDeviceToHost));

#ifdef DEBUG_PRINT
    for (int qq = 0; qq < n*batchSize; qq++) {printf("pivot[%d] = %d\n", qq, pivot[qq]); }
#endif

    // getrf status: 0 means success, i > 0 means U(i,i) is exactly zero
    // (singular matrix), and a negative value flags an illegal argument.
    // The previous test (INFOh == n) only caught a zero in the LAST pivot and
    // let every other failure through.
    if (INFOh != 0)
    {
        if (INFOh > 0)
            fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        else
            fprintf(stderr, "Factorization Failed: illegal value for argument %d\n", -INFOh);
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release resources; return codes are checked like every other call.
    cudacall(cudaFree(P));
    cudacall(cudaFree(INFO));
    cudacall(cudaFree(A_d));
    cublascall(cublasDestroy(handle));
}
// Host-side convenience wrapper around LU_device().
//
// Copies the n x n host matrix `src` to the device, factorizes it there, and
// copies the factorized buffer back into BOTH `L` and `U`. The two outputs are
// then turned into separate triangular matrices by zeroing the opposite
// triangle in each, using column-major indexing (element (row, col) lives at
// col*n + row, the cuBLAS convention). The diagonal is left untouched in both.
//
//   src    host input matrix, n*n floats
//   L, U   host outputs, n*n floats each
//   n      order of the matrix
//   pivot  host array of n ints, filled by LU_device()
void LU(float* src, float* L, float *U, int n, int *pivot)
{
    const size_t matrixBytes = n*n * sizeof(float);

    float *deviceMatrix;
    cudacall(cudaMalloc<float>(&deviceMatrix, matrixBytes));
    cudacall(cudaMemcpy(deviceMatrix, src, matrixBytes, cudaMemcpyHostToDevice));

    LU_device(deviceMatrix, n, pivot);

    // Both outputs start out as the full factorized matrix.
    cudacall(cudaMemcpy(L, deviceMatrix, matrixBytes, cudaMemcpyDeviceToHost));
    cudacall(cudaMemcpy(U, deviceMatrix, matrixBytes, cudaMemcpyDeviceToHost));

    for (int col = 0; col < n; ++col)
    {
        float *lColumn = L + col*n;
        float *uColumn = U + col*n;

        // Entries above the diagonal do not belong to L.
        for (int row = 0; row < col; ++row)
            lColumn[row] = 0.0f;

        // Entries below the diagonal do not belong to U.
        for (int row = col + 1; row < n; ++row)
            uColumn[row] = 0.0f;
    }

    cudaFree(deviceMatrix);
}
// Direction selectors for rearrange(). They stay macros because callers pass
// them by name.
#define DIR_FORWARD 0
#define DIR_REVERSE 1

// Applies, or undoes, a sequence of row interchanges to a vector in place.
//
//   vec    vector of n floats, permuted in place
//   pivot  n 1-based pivot indices: step i exchanges vec[i] with
//          vec[pivot[i]-1]. Every entry must lie in [1, n]; this is not
//          checked.
//   n      number of elements / interchange steps
//   dir    DIR_FORWARD performs the exchanges for i = 0 .. n-1; any other
//          value (normally DIR_REVERSE) performs them for i = n-1 .. 0,
//          which undoes a previous forward pass.
//
// The exchange is written out explicitly instead of through a statement-like
// SWAP macro: that macro leaked into file scope and only compiled inside the
// unbraced if/else because its call sites omitted the trailing semicolon.
void rearrange(float *vec, int *pivot, int n, int dir){
    if (dir == DIR_FORWARD) {
        for (int i = 0; i < n; i++) {
            float *partner = vec + pivot[i] - 1;
            const float held = vec[i];
            vec[i] = *partner;
            *partner = held;
        }
    } else {
        for (int i = n-1; i >= 0; i--) {
            float *partner = vec + pivot[i] - 1;
            const float held = vec[i];
            vec[i] = *partner;
            *partner = held;
        }
    }
}
void TRSM(float *A, float *x, float *b, int n, cublasFillMode_t uplo, cublasDiagType_t diagt ){
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
float *A_d, *b_d;
cudacall(cudaMalloc<float>(&A_d, n*n * sizeof(float)));
cudacall(cudaMalloc<float>(&b_d, n * sizeof(float)));
cudacall(cudaMemcpy(b_d, b, n*sizeof(float), cudaMemcpyHostToDevice));
cudacall(cudaMemcpy(A_d, A, n*n*sizeof(float), cudaMemcpyHostToDevice));
const float alpha = 1.0f;
cublascall(cublasStrsm(handle, CUBLAS_SIDE_LEFT, uplo, CUBLAS_OP_N, diagt, n, 1, &alpha, A_d, n, b_d, n));
cudacall(cudaMemcpy(x, b_d, n*sizeof(float), cudaMemcpyDeviceToHost));
cudaFree(A_d); cudaFree(b_d); cublasDestroy(handle);
}
void test_solve()
{
// solve Ax=b
// 1. Perform LU on A
// 2. using pivot sequence, rearrange b -> b'
// 3. perform TRSM on Ly=b'
// 4. perform TRSM on Ux=y
// A = |0 1 4 |
// |3 3 9 |
// |4 10 16|
// x = |1|
// |2|
// |3|
// b = |14|
// |36|
// |72|
const int n = 3;
// has 3,2,3 pivot order
float A_col_major[n*n] = { 0, 3, 4,
1, 3, 10,
4, 9, 16 };
float b1[n] = {14, 36, 72};
/* another example - has 3,3,3 pivot order
float A_transpose[n*n] = { 0, 1, 4,
3, 3, 9,
4, 10, 16 };
float b2[n] = {18, 37, 70};
*/
float result_x[n];
int pivot[n];
float L[n*n];
float U[n*n];
float y[n];
//Select matrix by setting "a"
float *a = A_col_major;
float *b = b1;
printf("Input:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",a[i*n+j]);
printf("\n");
}
printf("\n\n");
// 1. LU on A
LU(a,L,U,n,pivot);
#ifdef DEBUG_PRINT
printf("L:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",L[i*n+j]);
printf("\n");
}
printf("\n\n");
printf("U:\n\n");
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
printf("%f\t",U[i*n+j]);
printf("\n");
}
printf("\n\n");
#endif
// 2. Rearrange b
rearrange(b,pivot,n,DIR_FORWARD);
#ifdef DEBUG_PRINT
for (int i = 0; i < n; i++) printf("b'[%d] = %f\n", i, b[i]);
#endif
// 3. TRSM on Ly=b
TRSM(L, y, b, n, CUBLAS_FILL_MODE_LOWER, CUBLAS_DIAG_UNIT);
// 4. TRSM on Ux=y
TRSM(U, result_x, y, n, CUBLAS_FILL_MODE_UPPER, CUBLAS_DIAG_NON_UNIT);
fprintf(stdout, "Solution:\n\n");
for(int i=0; i<n; i++)
{
printf("%f\n",result_x[i]);
}
}
// Program entry point: runs the single LU + triangular-solve demonstration
// and reports success to the caller.
int main()
{
    test_solve();
    return EXIT_SUCCESS;
}
$ nvcc -o t853 t853.cu -lcublas
$ ./t853
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Solution:
1.000000
2.000000
3.000000
$
请注意,对于这个简单的测试用例,我使用了cublas getrfBatched进行矩阵LU分解,而不是LAPACK,但我认为它应该与LAPACK类似。
另请注意,我无意评论什么是"求解线性系统的最佳方法",而只是说明您所描述的方法可以如何实现。
答案 1 :(得分:0)
对于GPU上的置换,可以根据给定的主元向量构造一个置换矩阵P,并在GPU上将其与B相乘。实际上,LAPACK给出的置换向量表示的是各个交换步骤的先后顺序。因此,一旦for循环处理过第n行,之后就不会再改动这一行。于是,用一个小算法就可以从 *<T>getrf* 返回的向量构造出置换矩阵P。这样就可以求解线性系统 L * U * X = P * B,并得到正确的结果。
/*
 * Builds the dense row-permutation matrix P described by a LAPACK-style pivot
 * vector, so that P * B applies the recorded row interchanges to B.
 *
 *   rows        number of rows of the factorized matrix A; P is rows x rows
 *   cols        number of columns of A; only min(rows, cols) pivot entries
 *               are read, matching the documented length of IPIV
 *   permArray   1-based pivot vector: step i exchanges row i with row
 *               permArray[i]. Every entry read must lie in [1, rows]; this
 *               is not checked.
 *   permMatrix  output, column-major with leading dimension `rows`; must
 *               hold at least rows * max(rows, cols) doubles
 *
 * Nothing is written when rows <= 0 or either pointer is null.
 *
 * The permutation is tracked in a heap-allocated index array that starts as
 * the identity and has the interchanges applied to it in order. This replaces
 * a variable-length stack array with zero-sentinel bookkeeping, which is not
 * standard C++ and could exhaust the stack for large matrices.
 */
void
permutationMatrix ( int const rows,       //number of rows of A
                    int const cols,       //number of cols of A
                    int* permArray,       //permutation vector from LAPACK
                    double* permMatrix)   //Memory for permutation matrix
{
    if (rows <= 0 || !permArray || !permMatrix)
    {
        return;
    }

    // The pivot vector only has min(rows, cols) valid entries.
    int const pivotCount = (cols < rows) ? cols : rows;

    // P itself is rows x rows, so that whole area must be cleared even when
    // cols < rows. When cols > rows the wider rows x cols area is cleared
    // too, exactly as before, so existing callers see the same buffer state.
    int const clearedCols = (cols > rows) ? cols : rows;
    memset(permMatrix, 0, sizeof(double) * rows * clearedCols);

    // source[r] = 0-based index of the original row that ends up in row r.
    int* source = new int[rows];
    for (int row = 0; row < rows; ++row)
    {
        source[row] = row;
    }

    // Replay the interchanges in the order they were recorded.
    for (int step = 0; step < pivotCount; ++step)
    {
        int const partner = permArray[step] - 1;
        if (partner != step)
        {
            int const held = source[step];
            source[step] = source[partner];
            source[partner] = held;
        }
    }

    // Row r of P has its single one in column source[r]. The index is
    // computed in 64 bits so that large matrices cannot overflow int.
    for (int row = 0; row < rows; ++row)
    {
        permMatrix[row + (long long)source[row] * rows] = 1.0;
    }

    delete[] source;
}