用Cuda中的Gauss-Jordan方法求复数的矩阵求逆

时间:2014-08-26 19:48:58

标签: c++ matrix cuda complex-numbers inversion

我正在尝试反转由复数组成的矩阵,其中我使用矩阵求逆代码,用于通过'用户'在以下链接中发布的实数  cuda matrix inverse gaussian jordan

代码编译,没有错误,但输出的问题是错误的!我不知道哪里出错了。 任何人,请,帮助。 提前谢谢!

这是完整的代码:

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){ 
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P; 
            if(y>=i){ 
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;  
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
            if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
            }
        }
    __syncthreads();

}

int main()
{
    int const n = 3;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(0,1);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f  ", cuCimag(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *d_L, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n/16,n/16); //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(16,16);     //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// memory allocation    
    cudaMalloc( (void**)  &d_A, ddsize);   
    cudaMalloc( (void**)   &dI, ddsize); 

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); 
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice); 
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n); 

    cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); 
    cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f  ", cuCimag(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

    system("Pause");
 return 0;
}

感谢@RobertCrovella提供快速且非常有见地的建议!关于你对我的问题的回答:我改变了我的threadsPerBlock(4,4)和numBlocks(1,1),所以我将使用1个块,16个线程用于我的4x4矩阵。我的输入矩阵如下

1  0  0  0
0  2  0  0 
0  0  3  0
0  0  0  4

这里的所有数字都是真实的,那么预期的倒置矩阵应该看起来像

1   0    0   0
0   1/2  0   0 
0   0    1/3 0
0   0    0   1/4

我根本没有得到这个。我输入了cuda memcheck工具,看看我的内核是不是在吃午饭 但它没有显示任何错误按摩。我最近开始学习CUDA并且没有太多经验。谁能提供更详细的回复?谢谢!

这是我修改后的代码。

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}




 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){ 
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P; 
            if(y>=i){ 
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;  
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if( cuCimag(d_A[x*h+x]) != 0 ){
            if( cuCreal(d_A[x*h+x]) != 0 ){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];
            }
        }
    __syncthreads();
}

int main()
{
    int const n= 4;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f ", cuCreal(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(1,1);       //!!!!!!!!!!!!!!!!!!

// memory allocation    
    cudaMalloc( (void**)  &d_A, ddsize);   
    cudaMalloc( (void**)   &dI, ddsize); 

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); 
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice); 
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
        gpuErrchk( cudaPeekAtLastError() );
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n); 

    gpuErrchk( cudaPeekAtLastError() );

    gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost )); 
    gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost )); 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f ", cuCreal(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

    system("Pause");
 return 0;
}

1 个答案:

答案 0 :(得分:1)

免责声明:我不是矩阵求逆的专家。我还没有完成真实矩阵求逆和复杂矩阵求逆之间差异的细节(不应该有很多不同,我不会想到)。正如已经建议的那样,可能有更好/更快的方法来反转矩阵。

直接问题似乎出现在dev内核中,特别是在这里:

    if( cuCimag(d_A[x*h+x]) != cuCimag(temp)){
        if( cuCreal(d_A[x*h+x]) != cuCreal(temp)){

这要求两个所讨论的d_A矩阵元素的实部和虚部都不为零,以便dev内核完成任何工作。但是,我不认为这种情况应该是必要的。对于除法,我们可能只要求要么实部或虚部都是非零的。我认为在复数域中,只有当实部和虚部都为零时,我们实际上除以零。如果您检查cuCdiv中提供的cuComplex.h功能,您可以确定它将在什么条件下爆炸?#34;因此需要测试和避免哪些条件。我确信你的测试不正确。

对于我的简单测试用例,以下修改后的代码正常工作:

#include <stdio.h>
#include <iostream>
#include <fstream>
#include <math.h>
#include "cuComplex.h"
#include <complex>

__device__ __host__ cuDoubleComplex  operator*(cuDoubleComplex a, cuDoubleComplex b) { return cuCmul(a,b); }
__device__ __host__ cuDoubleComplex  operator+(cuDoubleComplex a, cuDoubleComplex b) { return cuCadd(a,b); }
__device__ __host__ cuDoubleComplex  operator/(cuDoubleComplex a, cuDoubleComplex b) { return cuCdiv(a,b); }
__device__ __host__ cuDoubleComplex  operator-(cuDoubleComplex a, cuDoubleComplex b) { return cuCsub(a,b); }

using namespace std;

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}




 __global__ void gaussjordan(cuDoubleComplex *A,  cuDoubleComplex *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    cuDoubleComplex P;

    if(x<n && y<n)
        if(x>i){
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] = I[x*n+y] - I[i*n+y]*P;
            if(y>=i){
                A[x*n+y] = A[x*n+y] - A[i*n+y]*P;
            }
        }
 }


 __global__ void dev(cuDoubleComplex *d_A,  cuDoubleComplex *dI, int h)
{
    cuDoubleComplex temp = make_cuDoubleComplex(0,0);
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x<h && y<h)
        if(( cuCimag(d_A[x*h+x]) != 0 ) || ( cuCreal(d_A[x*h+x]) != 0 )){

            dI[x*h+y]  = dI[x*h+y]/d_A[x*h+x];
            d_A[x*h+y] = d_A[x*h+y]/d_A[x*h+x];

        }
    __syncthreads();
}

int main()
{
    int const n= 4;
// creating input
    cuDoubleComplex iL[n*n],L[n*n], I[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j ) L[i*n+j] =make_cuDoubleComplex(i+1,0);
            else L[i*n+j] = make_cuDoubleComplex(0,0);

            printf("%.2f ", cuCreal(L[i*n+j]));
        }
    printf("\n");
    }
printf("\n");

    cuDoubleComplex *d_A, *dI;
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(cuDoubleComplex);

    dim3 threadsPerBlock(n,n); //!!!!!!!!!!!!!!!!!!
    dim3 numBlocks(1,1);       //!!!!!!!!!!!!!!!!!!

// memory allocation
    cudaMalloc( (void**)  &d_A, ddsize);
    cudaMalloc( (void**)   &dI, ddsize);

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=make_cuDoubleComplex(1,0);
                else I[i*n+j]=make_cuDoubleComplex(0,0);
        }
    }

 //copy data from GPU to CPU
    cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice);
    cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);
//timer start
    cudaEventRecord( start, 0);
// L^(-1)
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
        gpuErrchk( cudaPeekAtLastError() );
    }
    dev<<<numBlocks,  threadsPerBlock>>>(d_A, dI, n);

    gpuErrchk( cudaPeekAtLastError() );

    gpuErrchk(cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ));
    gpuErrchk(cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ));

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );


    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            printf("%.2f ", cuCreal(iL[i*n+j]));
        }
    printf("\n");
    }
printf("\n");



    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";

    cudaFree(d_A);
    cudaFree(dI);

 return 0;
}

最终免责声明:我并不是说这是一种完全验证的任意维矩阵求逆方法。我只是简单地指出一个关键的错误,它似乎使你的简单测试用例失败了。我也在你上一个问题中表达了一些保留意见。