I am performing a QR decomposition in two different ways: with the standard NumPy routine and with the GEQRF LAPACK function implemented in the CULA library. Here is a simple example in Python (using PyCULA to access CULA):
from PyCULA.cula import culaInitialize,culaShutdown
from PyCULA.cula import gpu_geqrf, gpu_orgqr
import numpy as np
import sys
def test_numpy(A):
    Q, R = np.linalg.qr(A)
    print "Q"
    print Q
    print "R"
    print R
    print "transpose(Q)*Q"
    print np.dot(np.transpose(Q), Q)
    print "Q*R"
    print np.dot(Q, R)

def test_cula(A):
    culaInitialize()
    QR, TAU = gpu_geqrf(A)
    R = np.triu(QR)
    Q = gpu_orgqr(QR, A.shape[0], TAU)
    culaShutdown()
    print "Q"
    print Q
    print "R"
    print R
    print "transpose(Q)*Q"
    print np.dot(np.transpose(Q), Q)
    print "Q*R"
    print np.dot(Q, R)

def main():
    rows = int(sys.argv[1])
    cols = int(sys.argv[2])
    A = np.array(np.ones((rows, cols)).astype(np.float64))
    print "A"
    print A
    print "NUMPY"
    test_numpy(A.copy())
    print "CULA"
    test_cula(A.copy())

if __name__ == '__main__':
    main()
It produces the following output:
A
[[ 1. 1. 1.]
[ 1. 1. 1.]
[ 1. 1. 1.]]
NUMPY
Q
[[-0.57735027 -0.57735027 -0.57735027]
[-0.57735027 0.78867513 -0.21132487]
[-0.57735027 -0.21132487 0.78867513]]
R
[[-1.73205081 -1.73205081 -1.73205081]
[ 0. 0. 0. ]
[ 0. 0. 0. ]]
transpose(Q)*Q
[[ 1.00000000e+00 2.77555756e-17 0.00000000e+00]
[ 2.77555756e-17 1.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 0.00000000e+00 1.00000000e+00]]
Q*R
[[ 1. 1. 1.]
[ 1. 1. 1.]
[ 1. 1. 1.]]
CULA
Q
[[-0.57735027 -0.57735027 -0.57735027]
[-0.57735027 0.78867513 -0.21132487]
[-0.57735027 -0.21132487 0.78867513]]
R
[[-1.73205081 0.3660254 0.3660254 ]
[-0. 0. 0. ]
[-0. 0. 0. ]]
transpose(Q)*Q
[[ 1.00000000e+00 2.77555756e-17 0.00000000e+00]
[ 2.77555756e-17 1.00000000e+00 0.00000000e+00]
[ 0.00000000e+00 0.00000000e+00 1.00000000e+00]]
Q*R
[[ 1. -0.21132487 -0.21132487]
[ 1. -0.21132487 -0.21132487]
[ 1. -0.21132487 -0.21132487]]
What is going wrong in my code?
Answer 0 (score: 1)
I tested your example in R. CULA appears to return the same result as R. Here is my code:
#include <Rcpp.h>
#include <cula.h>

// [[Rcpp::export]]
std::vector< float > gpuQR_cula( std::vector< float > x, const uint32_t nRows, const uint32_t nCols )
{
    std::vector< float > tau( nCols ) ;

    culaInitialize() ;
    culaSgeqrf( nRows, nCols, &x.front(), nRows, &tau.front() ) ;
    culaShutdown() ;

    Rcpp::Rcout << "Tau: " << tau[ 0 ] << ", " << tau[ 1 ] << ", " << tau[ 2 ] << "\n" ;

    for( uint32_t jj = 0 ; jj < nCols ; ++jj ) {
        for( uint32_t ii = 1 ; ii < nRows ; ++ii ) {
            if( ii > jj ) { x[ ii + jj * nRows ] *= tau[ jj ] ; }
        }
    }

    return x ;
}
Your matrix:
(A <- matrix(1, 3, 3))
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 1 1
[3,] 1 1 1
n_row <- nrow(A)
n_col <- ncol(A)
Here is the result from CULA:
matrix(gpuQR_cula(c(A), n_row, n_col), n_row, n_col)
Tau: 1.57735, 0, 0
[,1] [,2] [,3]
[1,] -1.7320509 -1.732051 -1.732051
[2,] 0.5773503 0.000000 0.000000
[3,] 0.5773503 0.000000 0.000000
And here is the result from R:
(qrA <- qr(A))
$qr
[,1] [,2] [,3]
[1,] -1.7320508 -1.732051 -1.732051
[2,] 0.5773503 0.000000 0.000000
[3,] 0.5773503 0.000000 0.000000
$qraux
[1] 1.57735 0.00000 0.00000
Q <- qr.Q(qrA)
R <- qr.R(qrA)
crossprod(Q)
[,1] [,2] [,3]
[1,] 1.000000e+00 4.163336e-17 5.551115e-17
[2,] 4.163336e-17 1.000000e+00 0.000000e+00
[3,] 5.551115e-17 0.000000e+00 1.000000e+00
Q %*% R
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 1 1
[3,] 1 1 1
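For reference, the same compact GEQRF representation (R in the upper triangle, the Householder vectors below the diagonal, and tau matching the qraux values shown above) can also be reproduced from plain Python via SciPy's LAPACK wrappers, which take care of the row-/column-major conversion themselves. This is only a minimal sketch for comparison and does not use CULA:

import numpy as np
from scipy.linalg import lapack

A = np.ones((3, 3), dtype=np.float64)

# dgeqrf returns the compact QR factorization: R in the upper triangle,
# Householder vectors below the diagonal, and the scalar factors tau.
qr_compact, tau, work, info = lapack.dgeqrf(A)
print(qr_compact)   # compare with R's $qr and the CULA output above
print(tau)          # compare with R's $qraux: 1.57735, 0, 0

# dorgqr expands the Householder vectors into an explicit Q.
Q, work, info = lapack.dorgqr(qr_compact, tau)
R = np.triu(qr_compact)
print(Q.dot(R))     # recovers A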
I hope that helps!
Answer 1 (score: 1)
This is a tricky one. The problem is that Python (NumPy) uses row-major order by default, while CULA, like R, uses column-major order. See the CULA documentation for more details.
Here is an example using scikit-cuda:
import numpy as np
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
from skcuda import linalg
linalg.init()
# skcuda
A = np.ones( (3,3) )
A_gpu = gpuarray.to_gpu(np.array(A, order='F'))
Q , R = linalg.qr(A_gpu)
Q, R = Q.get(), R.get()
print Q.dot(R) #recovers A
[[ 1. 1. 1.]
[ 1. 1. 1.]
[ 1. 1. 1.]]
print Q.T.dot(Q) # As expected
[[ 1.00000000e+00 -5.55111512e-17 1.11022302e-16]
[ -5.55111512e-17 1.00000000e+00 -2.22044605e-16]
[ 1.11022302e-16 -2.22044605e-16 1.00000000e+00]]
If you instead use the following (which is the default order in Python)
A_gpu = gpuarray.to_gpu(np.array(A, order='C'))
you will get the same wrong results you posted above.
This can cause subtle problems, so you have to be very careful and keep track of the matrix memory order.
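A quick way to check and, if necessary, convert the memory order before handing an array to a column-major library (a small NumPy-only sketch):

import numpy as np

A = np.ones((3, 3))
print(A.flags['C_CONTIGUOUS'])    # True: NumPy arrays are row-major by default
A_f = np.asfortranarray(A)        # copy into column-major (Fortran) order
print(A_f.flags['F_CONTIGUOUS'])  # True: now laid out the way CULA/LAPACK expect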
Cheers, Ben