LU decomposition in CUDA

Date: 2014-04-02 14:05:46

Tags: cuda parallel-processing

I have been trying to implement LU decomposition. The code below works for matrices of size 100x100, but fails for larger ones, and I cannot figure out what is going wrong. I have already changed the max-finding kernel, but the results are similar. Please help.

 __device__ float max_value = 0;  
 __device__ int max_index;
 __device__ int lock = 0;
 __device__ int blockLock = 0;

__global__ void maxIndex(float *LU, int n, int col)
 {
max_value = 0;
max_index = INT_MAX;

__shared__ float inter[64];
__shared__ int indexBlock;
__shared__ int threadLast;

indexBlock = INT_MAX;
threadLast = INT_MAX;

int index = n*(blockDim.x*blockIdx.x + threadIdx.x + col) + col;
float myValue = fabsf(LU[index]);
int noElem = n - col - blockIdx.x*blockDim.x > blockDim.x ? blockDim.x : n - col - blockIdx.x*blockDim.x;

if(index < n*n)
    inter[threadIdx.x] = fabsf(LU[index]);
else
    inter[threadIdx.x] = FLT_MIN;

__syncthreads();

int h = ceil(log2((float)noElem));

for(int d=0; d<h; d++)
{
    if(threadIdx.x < ceil(noElem/exp2((float)(d+1))))
    {
        int parent = threadIdx.x;
        int left = 2*parent;
        int right = 2*parent + 1;

        if(right < ceil(noElem/exp2((float)d)))
            inter[parent] = (inter[left] >= inter[right]) ? inter[left] : inter[right];
        else
            inter[parent] = inter[left];
    }
    __syncthreads();
}

__syncthreads();

int loop = 1;
if(myValue == inter[0])
{
    while(loop)
    {
        if( 0 == atomicCAS( &blockLock, 0, 1 ) )
        {
            if(threadIdx.x < threadLast)
            {
                threadLast = threadIdx.x;
                indexBlock = blockDim.x*blockIdx.x + threadIdx.x + col;
            }
            __threadfence_block();

            atomicExch( &blockLock, 0);
            loop = 0;
        }
     }
}   

__syncthreads();

if( threadIdx.x == 0 && index < n*n)
{
    while(0 != atomicCAS(&lock, 0, 1));

    if((max_value < inter[0]) || (max_value == inter[0] && indexBlock < max_index))
    {
        max_value = inter[0];
        max_index = indexBlock;
    }

    __threadfence();

    atomicExch(&lock, 0);   
}
}

__global__ void swap(float *LU, int n, int row)
{
if(max_index == row)
    return;

float temp;
int index =  blockDim.x*blockIdx.x + threadIdx.x;

if(index < n)
{
    temp = LU[index + n*row];
    LU[index + n*row] = LU[index + n*max_index];
    LU[index + n*max_index] = temp;
}

  }

 __global__ void elimination(float *LU, int n, int row)
 {
float factor;

int indexX = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
int indexY = row + 1 + blockIdx.y;


if((indexX < n) && (indexY < n))
{
    factor = LU[n*indexY + row] / LU[n*row + row];

    LU[n*indexY + indexX] -= LU[n*row + indexX]*factor; 
}

__syncthreads();

if(blockIdx.x == 0 && threadIdx.x == 0)
{
    LU[n*indexY + row] = factor;
    __threadfence();
}

}

int luDecomposeP(float *LU, int n)
{
int i, noOfThreadsPerBlock = 64, noOfBlocks, sharedSize, pivotValue;
float *dLU;

cudaMalloc((void **)&dLU, n*n*sizeof(float));

cudaMemcpy(dLU, LU, n*n*sizeof(float), cudaMemcpyHostToDevice);

dim3 gridDim(1,1,1);
dim3 blockDim(noOfThreadsPerBlock,1,1);

for(i=0; i<n-1; i++)
{
    noOfBlocks = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
    sharedSize = ((noOfThreadsPerBlock < (n-i) ? noOfThreadsPerBlock : n-i) + 3) * sizeof(float);
    maxIndex <<< noOfBlocks, noOfThreadsPerBlock, sharedSize >>> (dLU, n, i);
    //maxIndex <<< 1, noOfThreadsPerBlock >>> (dLU, n, i);

    cudaMemcpyFromSymbol(&pivotValue, "max_value", sizeof(pivotValue), 0, cudaMemcpyDeviceToHost);      
    if(pivotValue <= 1e-20F)
        return -1;

    noOfBlocks = ceil((float)n / (float)noOfThreadsPerBlock);
    swap <<< noOfBlocks, noOfThreadsPerBlock >>> ( dLU, n, i);

    gridDim.x = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
    gridDim.y = n-i-1;
    elimination <<< gridDim, blockDim >>> ( dLU, n, i);

}

cudaMemcpy(LU, dLU, n*n*sizeof(float), cudaMemcpyDeviceToHost);

cudaFree(dLU);

return 0;
 }

2 Answers:

Answer 0 (score: 2)

Although this problem has probably already been solved, I thought it would be useful for other users to have an example of how to obtain the LU decomposition of a matrix using the cuBLAS routine cublas<t>getrfBatched().

#include <stdio.h>

#include "cuda_runtime.h" 
#include "device_launch_parameters.h"

#include "cublas_v2.h"

#include "Utilities.cuh"

int main() {

    const unsigned int N = 3; 

    const unsigned int Nmatrices = 1;

    cublasHandle_t handle;
    cublasSafeCall(cublasCreate(&handle));

    // --- Matrices to be inverted (only one in this example)
    float *h_A = new float[N*N*Nmatrices];

    h_A[0] = 4.f;  
    h_A[1] = 3.f;
    h_A[2] = 8.f;
    h_A[3] = 9.f;
    h_A[4] = 5.f; 
    h_A[5] = 1.f; 
    h_A[6] = 2.f; 
    h_A[7] = 7.f;
    h_A[8] = 6.f;

    // --- Allocate device matrices 
    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N*N*Nmatrices*sizeof(float)));

    // --- Move the matrix to be inverted from host to device
    gpuErrchk(cudaMemcpy(d_A,h_A,N*N*Nmatrices*sizeof(float),cudaMemcpyHostToDevice));

    // --- Creating the array of pointers needed as input to the batched getrf
    float **h_inout_pointers = (float **)malloc(Nmatrices*sizeof(float *));
    for (int i=0; i<Nmatrices; i++) h_inout_pointers[i]=(float *)((char*)d_A+i*((size_t)N*N)*sizeof(float));

    float **d_inout_pointers;
    gpuErrchk(cudaMalloc((void**)&d_inout_pointers, Nmatrices*sizeof(float *)));
    gpuErrchk(cudaMemcpy(d_inout_pointers,h_inout_pointers,Nmatrices*sizeof(float *),cudaMemcpyHostToDevice));
    free(h_inout_pointers);

    int *d_PivotArray; gpuErrchk(cudaMalloc((void**)&d_PivotArray, N*Nmatrices*sizeof(int)));
    int *d_InfoArray;  gpuErrchk(cudaMalloc((void**)&d_InfoArray,  Nmatrices*sizeof(int)));

    int *h_PivotArray = (int *)malloc(N*Nmatrices*sizeof(int));
    int *h_InfoArray  = (int *)malloc(  Nmatrices*sizeof(int));

    cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, d_PivotArray, d_InfoArray, Nmatrices));
    //cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));

    gpuErrchk(cudaMemcpy(h_InfoArray,d_InfoArray,Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));

    for (int i = 0; i < Nmatrices; i++)
        if (h_InfoArray[i]  != 0) {
            fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
            cudaDeviceReset();
            exit(EXIT_FAILURE);
        }

    gpuErrchk(cudaMemcpy(h_A,d_A,N*N*sizeof(float),cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_PivotArray,d_PivotArray,N*Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));

    for (int i=0; i<N*N; i++) printf("A[%i]=%f\n", i, h_A[i]);

    printf("\n\n");
    for (int i=0; i<N; i++) printf("P[%i]=%i\n", i, h_PivotArray[i]);

    return 0;
}

The Utilities.cuh and Utilities.cu files needed to compile and run this example are kept in the CUDA-Utilities repository.
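If you prefer not to pull in that repository, a minimal stand-in for the two error-checking helpers it provides is sketched below. These are my own simplified versions, not the repository's actual definitions; dropped into a local Utilities.cuh, they let the example build with something like nvcc -o lu_batched lu_batched.cu -lcublas (file names are only placeholders).

    // Minimal Utilities.cuh stand-in (a sketch, not the repository's code):
    // abort with a message on any CUDA runtime or cuBLAS error.
    #pragma once
    #include <cstdio>
    #include <cstdlib>
    #include "cuda_runtime.h"
    #include "cublas_v2.h"

    #define gpuErrchk(call)                                              \
        do {                                                             \
            cudaError_t err_ = (call);                                   \
            if (err_ != cudaSuccess) {                                   \
                fprintf(stderr, "CUDA error %s at %s:%d\n",              \
                        cudaGetErrorString(err_), __FILE__, __LINE__);   \
                exit(EXIT_FAILURE);                                      \
            }                                                            \
        } while (0)

    #define cublasSafeCall(call)                                         \
        do {                                                             \
            cublasStatus_t stat_ = (call);                               \
            if (stat_ != CUBLAS_STATUS_SUCCESS) {                        \
                fprintf(stderr, "cuBLAS error %d at %s:%d\n",            \
                        (int)stat_, __FILE__, __LINE__);                 \
                exit(EXIT_FAILURE);                                      \
            }                                                            \
        } while (0)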

Note that

  1. cublas<t>getrfBatched() can also be called without pivoting; see the commented-out cublas<t>getrfBatched() call line;
  2. cublas<t>getrfBatched() is able to perform the LU factorization of many matrices at once. The structure of the example above is general, although it deals with the case of a single matrix only;
  3. The example provided above is the same as the one in the Scientific Computing Software Library (SCSL) User's Guide, see Example 3-1, page 21. Note that cublas<t>getrfBatched() overwrites the input matrices with the corresponding LU factorizations. The full L and U matrices can be extracted by the following Matlab code (the Matlab code detailed in the cuBLAS User's Guide appears to be incorrect):

    L = eye(3);
    for k = 1:3
        L(k+1:3,k) = M(k+1:3,k);
    end
    U = zeros(3);
    for k = 1:3
        U(k,k:3) = M(k,k:3);
    end
    

    The permutation matrix P can be extracted from the d_PivotArray vector by the following Matlab code:

    P1 = eye(3);
    temp = P1(:,1);
    P1(:,1) = P1(:,3);
    P1(:,3) = temp;
    
    P2 = eye(3);
    temp = P2(:,2);
    P2(:,2) = P2(:,3);
    P2(:,3) = temp;
    

    Concerning the permutation matrices, both the cuBLAS User's Guide and the Scientific Computing Software Library (SCSL) User's Guide appear to be wrong (a quick numerical check of the resulting factorization is sketched below).

    In this way,

    P * A = L * U
    

    where

    P = P2 * P1
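
    To check these relations numerically on the host, a small routine along the following lines can be used. This is a sketch of my own, not part of the cuBLAS example; checkLU and h_A0 are hypothetical names, the matrix is assumed to be stored column-major, and h_A0 must hold a copy of the original matrix saved before the call to cublasSgetrfBatched. The pivot indices returned by cuBLAS are 1-based: row k was interchanged with row h_PivotArray[k] - 1.

    #include <vector>
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Sketch: verify P*A = L*U from the output of cublasSgetrfBatched (column-major).
    void checkLU(const float *h_A0, const float *h_LU, const int *h_PivotArray, int N)
    {
        // Apply the row interchanges to a copy of the original matrix, giving P*A.
        std::vector<float> PA(h_A0, h_A0 + N*N);
        for (int k = 0; k < N; k++) {
            int p = h_PivotArray[k] - 1;                     // 1-based -> 0-based
            if (p != k)
                for (int j = 0; j < N; j++)
                    std::swap(PA[k + j*N], PA[p + j*N]);     // swap rows k and p
        }

        // Rebuild L (unit lower triangular) and U (upper triangular) from the
        // overwritten matrix and compare L*U against P*A.
        double maxErr = 0.0;
        for (int i = 0; i < N; i++)
            for (int j = 0; j < N; j++) {
                double lu = 0.0;
                for (int k = 0; k < N; k++) {
                    double l = (i > k) ? h_LU[i + k*N] : (i == k ? 1.0 : 0.0);
                    double u = (k <= j) ? h_LU[k + j*N] : 0.0;
                    lu += l * u;
                }
                maxErr = std::max(maxErr, std::fabs(lu - (double)PA[i + j*N]));
            }
        printf("max |P*A - L*U| = %g\n", maxErr);
    }

    Calling checkLU(h_A0, h_A, h_PivotArray, N) after the two final cudaMemcpy calls of the example should print a value close to zero.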
    

Answer 1 (score: 1)

Synchronization in the elimination kernel works only for threads of the same block. For large matrices the assignment

LU[n*indexY + row] = factor;

is evaluated in some blocks before the statement

factor = LU[n*indexY + row] / LU[n*row + row];

has been executed in others, and therefore you obtain incorrect values.
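
One way to avoid this race (a sketch of my own, not part of the original answer; the kernel names are made up) is to split the elimination step into two kernels, so that the kernel launch boundary acts as a device-wide synchronization point between reading the pivot column and overwriting it with the multipliers:

    // Step 1: store the multipliers of the current column in place.
    __global__ void computeFactors(float *LU, int n, int row)
    {
        int indexY = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
        if (indexY < n)
            LU[n*indexY + row] /= LU[n*row + row];   // multiplier for row indexY
    }

    // Step 2: update the trailing submatrix using the stored multipliers.
    // Only columns indexX > row are touched, so the multiplier column and the
    // pivot row stay intact.
    __global__ void updateTrailing(float *LU, int n, int row)
    {
        int indexX = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
        int indexY = row + 1 + blockIdx.y;

        if (indexX < n && indexY < n)
            LU[n*indexY + indexX] -= LU[n*indexY + row] * LU[n*row + indexX];
    }

In the host loop these would replace the single elimination launch, e.g. computeFactors<<<noOfBlocks, noOfThreadsPerBlock>>>(dLU, n, i); followed by updateTrailing<<<gridDim, blockDim>>>(dLU, n, i); with the same grid configuration as before. Since both kernels are issued on the same stream, the second cannot start before the first has finished.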