I haven't found any similar question. I'm trying to write a Gauss-Jordan matrix inversion algorithm; the idea of the algorithm is simple :)
I only want to invert a lower triangular matrix, and the answer I get is almost correct. Where am I going wrong? Any help would be appreciated.
n is the matrix size in one direction (n % 16 == 0), and the thread configuration is:
dim3 threadsPerBlock(n/16, n/16);
I know this is a naive implementation, but first I just need it to work correctly :) Can anyone help me or give me a hint? Thank you very much!
Here is the complete code:
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

using namespace std;

// Elimination step for pivot row i: thread (x, y) updates element (x, y)
// of both A and I using a multiple of pivot row i.
__global__ void gaussjordan(float *A, float *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if (x < n && y < n)
        if (x > i)
            if (y >= i) {
                P = A[x*n + i] / A[i*n + i];
                I[x*n + y] -= I[i*n + y] * P;
                A[x*n + y] -= A[i*n + y] * P;
            }
    __syncthreads();
}

// Normalization step: divide each element of row x in both matrices
// by the diagonal element A[x*h + x].
__global__ void dev(float *d_A, float *dI, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < h && y < h)
        if (d_A[x*h + x] != 0) {
            dI[x*h + y]  /= d_A[x*h + x];
            d_A[x*h + y] /= d_A[x*h + x];
        }
    __syncthreads();
}
void savetofile(float *A, string s, int n, int h)
{
    std::ofstream plik;
    plik.open(s);
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < h; i++)
            plik << A[j*n + i] << "\t";
        plik << endl;
    }
    plik.close();
}

int main()
{
    int n = 16;

    // creating input: a lower triangular test matrix L
    float *iL = new float[n*n];
    float *L  = new float[n*n];
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (i == j || i > j)
                L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 - (i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
            else
                L[i*n+j] = 0;
    savetofile(L, "L.txt", n, n);
    cout << "inv\n";

    float *d_A, *d_L, *I, *dI;
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    int ddsize = n*n*sizeof(float);
    dim3 threadsPerBlock(n/16, n/16);
    dim3 numBlocks(16, 16);

    // memory allocation
    err = cudaMalloc((void**)&d_A, ddsize);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }
    err = cudaMalloc((void**)&dI, ddsize);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }

    // identity matrix on the host
    I = new float[n*n];
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (i == j) I[i*n+i] = 1.0;
            else        I[i*n+j] = 0.0;

    // copy data from CPU to GPU
    err = cudaMemcpy(d_A, L, ddsize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }
    err = cudaMemcpy(dI, I, ddsize, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }

    // timer start
    cudaEventRecord(start, 0);

    // L^(-1): one elimination step per pivot, then normalize the diagonal
    for (int i = 0; i < n; i++) {
        gaussjordan<<<numBlocks, threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks, threadsPerBlock>>>(d_A, dI, n);

    err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }
    err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) { cout << cudaGetErrorString(err) << " in " << __FILE__ << " at line " << __LINE__ << endl; }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    std::cout << "Cuda Time - inverse: " << time << "ms\n";

    savetofile(iL, "inv.txt", n, n);
    savetofile(L, "I.txt", n, n);

    cudaFree(d_A);
    cudaFree(dI);
    delete[] I;
    delete[] L;
    delete[] iL;

    system("Pause");
    return 0;
}
Here is my output:
60.6061 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-34.1233 -2.13675 -0 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-48.5115 1.91603 -0.0799201 -0 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-49.4891 1.8697 0.0748167 -0.0196634 -0 -0 -0 -0 0 0 0 0 0 0 0 0
-49.8332 1.84732 0.0725876 0.018747 -0.00767828 -0 -0 -0 0 0 0 0 0 0 0 0
-50.0073 1.83403 0.071321 0.0182352 0.00739595 -0.00376795 -0 -0 0 0 0 0 0 0 0 0
-50.112 1.82521 0.0705011 0.0179073 0.0072164 0.00365346 -0.00212282 -0 0 0 0 0 0 0 0 0
-50.1818 1.81893 0.0699261 0.0176789 0.00709196 0.00357445 0.00206784 -0.00131234 0 0 0 0 0 0 0 0
-50.2316 1.81423 0.0695003 0.0175105 0.00700059 0.00351662 0.0020277 0.00128271 -0.00086736 -0 -0 -0 -0 -0 -0 -0
-50.2689 1.81057 0.0691722 0.0173813 0.00693062 0.00347244 0.00199711 0.00126017 0.000850006 -0.000602925 -0 -0 -0 -0 -0 -0
-50.2979 1.80765 0.0689115 0.0172789 0.0068753 0.00343758 0.00197301 0.00124245 0.000836382 0.000592093 -0.000435975 -0 -0 -0 -0 -0
-50.321 1.80527 0.0686993 0.0171957 0.00683047 0.00340937 0.00195354 0.00122815 0.000825401 0.000583374 0.000428868 -0.00032541 -0 -0 -0 -0
-50.34 1.80328 0.0685233 0.0171269 0.0067934 0.00338607 0.00193748 0.00121637 0.000816362 0.000576204 0.000423029 0.000320554 -0.000249293 -0 -0 -0
-50.3557 1.80159 0.0683749 0.0170689 0.00676223 0.0033665 0.001924 0.00120649 0.000808792 0.000570204 0.000418147 0.000316498 0.000245864 -0.000195186 -0 -0
-50.369 1.80015 0.0682481 0.0170195 0.00673566 0.00334983 0.00191253 0.00119809 0.000802358 0.000565109 0.000414005 0.000313058 0.000242958 0.000192695 -0.000155673 -0
-50.3805 1.7989 0.0681385 0.0169768 0.00671274 0.00333547 0.00190265 0.00119086 0.000796824 0.000560729 0.000410446 0.000310105 0.000240465 0.000190559 0.00015382 -0.000126146
It should be:
60,6060606060606 4,44089209850063e-16 4,85722573273506e-17 -3,12250225675825e-17 0 1,73472347597681e-18 -1,08420217248550e-18 -7,58941520739853e-19 4,33680868994202e-19 -5,42101086242752e-19 0 -6,93889390390723e-18 0 -1,38777878078145e-17 0 1,18720137887163e-17
-34,1232841232841 -2,13675213675214 0 8,67361737988404e-18 3,03576608295941e-18 8,67361737988404e-19 -1,73472347597681e-18 1,35525271560688e-18 -8,67361737988404e-19 1,00288700954909e-18 0 0 6,93889390390723e-18 6,93889390390723e-18 -1,38777878078145e-17 3,02221355580334e-18
-17,9130271437964 1,91603268526345 -0,0799200799200800 1,30104260698261e-18 1,95156391047391e-18 -9,75781955236954e-19 1,95156391047391e-18 2,16840434497101e-19 -3,52365706057789e-19 -1,62630325872826e-19 1,38777878078145e-17 -3,46944695195361e-18 0 0 0 -2,72405795836983e-18
-2,86140643299924 0,0760191125748172 0,0748166415934231 -0,0196633632216454 -2,41234983378025e-18 7,99599102208060e-19 3,25260651745651e-19 -4,74338450462408e-19 2,67662411332359e-19 2,91379333855479e-19 -2,16840434497101e-18 -4,33680868994202e-19 1,30104260698261e-18 0 0 6,86096687275983e-20
-1,33482739506506 0,0346053236774996 0,00125734163772674 0,0187469132242915 -0,00767825058738617 5,35324822664718e-19 -2,23616698075135e-19 5,08219768352580e-20 5,92923063078010e-20 1,74488787134386e-19 -4,33680868994202e-19 4,33680868994202e-19 -2,16840434497101e-19 2,16840434497101e-19 0 -1,19008129089229e-19
-0,793561224702690 0,0203250367373064 0,000727127971238783 0,000177630032830862 0,00739591005669882 -0,00376795430225022 4,98055372985529e-19 -3,84552958053452e-19 3,20178454062126e-19 -1,35525271560688e-19 6,50521303491303e-19 -1,08420217248550e-19 1,08420217248550e-19 -2,16840434497101e-19 0 -7,15742840429884e-20
-0,532255026297144 0,0135340901236068 0,000479383336751935 0,000115847127348313 4,51920594555328e-05 0,00365346070706817 -0,00212282675610843 1,37219337455197e-19 -5,14996031930615e-19 3,30342849429177e-19 0 -2,71050543121376e-19 1,08420217248550e-19 0 0 5,08219768352580e-20
-0,384130052448431 0,00972113086608457 0,000342250536212794 8,21235560483452e-05 3,18129608485860e-05 1,56232096436654e-05 0,00206784220009096 -0,00131233595800525 6,39509875176997e-20 -3,37542629480839e-19 -1,08420217248550e-19 2,16840434497101e-19 0 0 0 -8,47032947254300e-22
-0,291692030052418 0,00735419164507677 0,000257375648850429 6,15185225200113e-05 2,37495210052671e-05 1,16038017329438e-05 6,53368676878396e-06 0,00128271813402154 -0,000867362869930264 1,77876918923403e-19 1,62630325872826e-19 -1,89735380184963e-19 1,62630325872826e-19 0 0 -9,07384044746169e-20
-0,229596895430646 0,00578230937666655 0,000201707743336976 4,79768824589291e-05 1,84020572663637e-05 8,96002707181433e-06 5,05525466995835e-06 3,12009781742606e-06 0,000850011219708818 -0,000602925394011745 0 2,71050543121376e-20 -8,13151629364128e-20 5,42101086242752e-20 -5,42101086242752e-20 7,73976355553617e-20
-0,185720949479909 0,00466765632076680 0,000162419592307734 3,85318721641536e-05 1,47407053519860e-05 7,17308297585328e-06 4,02178178072207e-06 2,48428717850195e-06 1,64547815065802e-06 0,000592092919336558 -0,000435974905284452 0 0 8,13151629364128e-20 -1,08420217248550e-19 2,64697796016969e-20
-0,153867987373140 0,00385473267086607 0,000133863548213241 3,17506489004575e-05 1,20962229586152e-05 5,86799087221288e-06 3,28276799988068e-06 2,02338706451671e-06 1,33735029942045e-06 9,34275734555363e-07 0,000428867197061432 -0,000325409609345764 0 2,71050543121376e-20 0 -1,09055491958991e-20
-0,129703518509601 0,00324211947468978 0,000112403568308126 2,65969300905272e-05 1,01402805713936e-05 4,89779294849866e-06 2,73496124917826e-06 1,68586638861081e-06 1,11012300345236e-06 7,73556738632873e-07 5,60933254708493e-07 0,000320553621268105 -0,000249293253625970 5,42101086242752e-20 0 -1,01114558078482e-20
-0,110691345431593 0,00276839969825208 9,59884298624889e-05 2,25961759289096e-05 8,63052307521336e-06 4,15554692230644e-06 2,31688356971108e-06 1,42511604039733e-06 9,39229137057347e-07 6,51934526276135e-07 4,72019315851685e-07 3,53897320062806e-07 0,000245863313382516 -0,000195185934120844 0 -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05 1,95227026042985e-05 7,41637441475814e-06 3,57424367962823e-06 1,99334817579930e-06 1,21993241781196e-06 8,05577604288488e-07 5,57554928001086e-07 4,03155267486669e-07 3,01723475812485e-07 2,31838854154289e-07 0,000192695260333710 -0,000155673036807333 -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05 1,70229067498473e-05 6,46008752692950e-06 3,11455737751181e-06 1,73159030599080e-06 1,06073213436631e-06 6,96842172109705e-07 4,82764206408816e-07 3,49217230232344e-07 2,60145440758586e-07 2,00286821017368e-07 1,56906945950947e-07 0,000153820426928509 -0,000126146355001072
Answer (score: 2):
The problem appears to be in your gaussjordan kernel. When you perform the Gauss-Jordan elimination on the original (L) matrix, it is only necessary to operate on the row elements to the right of the pivot point. However, when you apply the equivalent row operations to the identity matrix to build the inverse (I), those operations must be applied to every member of the row, not just the members to the right of the pivot point. So if you modify your gaussjordan kernel like this:
__global__ void gaussjordan(float *A, float *I, int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if (x < n && y < n)
        if (x > i) {                          // this limits the operation to rows below the pivot point
            P = A[x*n + i] / A[i*n + i];
            I[x*n + y] -= I[i*n + y] * P;     // apply for every row member of I
            if (y >= i) {                     // limits to row members to the right of the pivot
                A[x*n + y] -= A[i*n + y] * P; // apply only to members of A right of the pivot
            }
        }
}
I believe you will get better results. With the above change, I was able to reproduce your expected results to within the accuracy difference between float and double.
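For completeness, one quick way to confirm the fix is to multiply the original lower triangular matrix by the computed inverse on the host and compare the product against the identity. The sketch below is not part of the original answer: checkInverse and the 1e-3f tolerance are made-up names/choices, and it only assumes the same row-major n x n layout used in the question's code.

#include <cmath>
#include <iostream>

// Hypothetical helper (not in the original code): checks whether inv is a
// plausible inverse of A by forming A*inv and comparing it to the identity.
// Both matrices are n x n, row-major. The tolerance is an arbitrary choice
// for single precision.
bool checkInverse(const float *A, const float *inv, int n, float tol = 1e-3f)
{
    bool ok = true;
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < n; c++) {
            float sum = 0.0f;
            for (int k = 0; k < n; k++)
                sum += A[r*n + k] * inv[k*n + c];   // (A * inv)[r][c]
            float expected = (r == c) ? 1.0f : 0.0f; // identity entry
            if (std::fabs(sum - expected) > tol) {
                std::cout << "mismatch at (" << r << "," << c << "): "
                          << sum << " vs " << expected << "\n";
                ok = false;
            }
        }
    }
    return ok;
}

In the question's main this would have to be called with a copy of L saved before the cudaMemcpy(L, d_A, ...) call overwrites it, e.g. checkInverse(L_saved, iL, n).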