我一直在努力实现LU Decomposition。此代码适用于大小为100x100
的矩阵,但对于较大的矩阵则无效。我不清楚发生了什么。我更改了最大查找功能,但结果相似。请帮忙。
__device__ float max_value = 0;
__device__ int max_index;
__device__ int lock = 0;
__device__ int blockLock = 0;
__global__ void maxIndex(float *LU, int n, int col)
{
max_value = 0;
max_index = INT_MAX;
__shared__ float inter[64];
__shared__ int indexBlock;
__shared__ int threadLast;
indexBlock = INT_MAX;
threadLast = INT_MAX;
int index = n*(blockDim.x*blockIdx.x + threadIdx.x + col) + col;
float myValue = fabsf(LU[index]);
int noElem = n - col - blockIdx.x*blockDim.x > blockDim.x ? blockDim.x : n - col - blockIdx.x*blockDim.x;
if(index < n*n)
inter[threadIdx.x] = fabsf(LU[index]);
else
inter[threadIdx.x] = FLT_MIN;
__syncthreads();
int h = ceil(log2((float)noElem));
for(int d=0; d<h; d++)
{
if(threadIdx.x < ceil(noElem/exp2((float)(d+1))))
{
int parent = threadIdx.x;
int left = 2*parent;
int right = 2*parent + 1;
if(right < ceil(noElem/exp2((float)d)))
inter[parent] = (inter[left] >= inter[right]) ? inter[left] : inter[right];
else
inter[parent] = inter[left];
}
__syncthreads();
}
__syncthreads();
int loop = 1;
if(myValue == inter[0])
{
while(loop)
{
if( 0 == atomicCAS( &blockLock, 0, 1 ) )
{
if(threadIdx.x < threadLast)
{
threadLast = threadIdx.x;
indexBlock = blockDim.x*blockIdx.x + threadIdx.x + col;
}
__threadfence_block();
atomicExch( &blockLock, 0);
loop = 0;
}
}
}
__syncthreads();
if( threadIdx.x == 0 && index < n*n)
{
while(0 != atomicCAS(&lock, 0, 1));
if((max_value < inter[0]) || (max_value == inter[0] && indexBlock < max_index))
{
max_value = inter[0];
max_index = indexBlock;
}
__threadfence();
atomicExch(&lock, 0);
}
}
__global__ void swap(float *LU, int n, int row)
{
if(max_index == row)
return;
float temp;
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < n)
{
temp = LU[index + n*row];
LU[index + n*row] = LU[index + n*max_index];
LU[index + n*max_index] = temp;
}
}
__global__ void elimination(float *LU, int n, int row)
{
float factor;
int indexX = row + 1 + blockDim.x*blockIdx.x + threadIdx.x;
int indexY = row + 1 + blockIdx.y;
if((indexX < n) && (indexY < n))
{
factor = LU[n*indexY + row] / LU[n*row + row];
LU[n*indexY + indexX] -= LU[n*row + indexX]*factor;
}
__syncthreads();
if(blockIdx.x == 0 && threadIdx.x == 0)
{
LU[n*indexY + row] = factor;
__threadfence();
}
}
int luDecomposeP(float *LU, int n)
{
int i, noOfThreadsPerBlock = 64, noOfBlocks, sharedSize, pivotValue;
float *dLU;
cudaMalloc((void **)&dLU, n*n*sizeof(float));
cudaMemcpy(dLU, LU, n*n*sizeof(float), cudaMemcpyHostToDevice);
dim3 gridDim(1,1,1);
dim3 blockDim(noOfThreadsPerBlock,1,1);
for(i=0; i<n-1; i++)
{
noOfBlocks = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
sharedSize = ((noOfThreadsPerBlock < (n-i) ? noOfThreadsPerBlock : n-i) + 3) * sizeof(float);
maxIndex <<< noOfBlocks, noOfThreadsPerBlock, sharedSize >>> (dLU, n, i);
//maxIndex <<< 1, noOfThreadsPerBlock >>> (dLU, n, i);
cudaMemcpyFromSymbol(&pivotValue, "max_value", sizeof(pivotValue), 0, cudaMemcpyDeviceToHost);
if(pivotValue <= 1e-20F)
return -1;
noOfBlocks = ceil((float)n / (float)noOfThreadsPerBlock);
swap <<< noOfBlocks, noOfThreadsPerBlock >>> ( dLU, n, i);
gridDim.x = ceil((float)(n-i) / (float)noOfThreadsPerBlock);
gridDim.y = n-i-1;
elimination <<< gridDim, blockDim >>> ( dLU, n, i);
}
cudaMemcpy(LU, dLU, n*n*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(dLU);
return 0;
}
答案 0 :(得分:2)
虽然这个问题可能已经解决了,但我认为其他用户提供一个关于如何使用cuBLAS cublas<t>getrfBatched()
来提取矩阵的LU分解的示例是有用的。
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cublas_v2.h"
#include "Utilities.cuh"
int main() {
const unsigned int N = 3;
const unsigned int Nmatrices = 1;
cublasHandle_t handle;
cublasSafeCall(cublasCreate(&handle));
// --- Matrices to be inverted (only one in this example)
float *h_A = new float[N*N*Nmatrices];
h_A[0] = 4.f;
h_A[1] = 3.f;
h_A[2] = 8.f;
h_A[3] = 9.f;
h_A[4] = 5.f;
h_A[5] = 1.f;
h_A[6] = 2.f;
h_A[7] = 7.f;
h_A[8] = 6.f;
// --- Allocate device matrices
float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N*N*Nmatrices*sizeof(float)));
// --- Move the matrix to be inverted from host to device
gpuErrchk(cudaMemcpy(d_A,h_A,N*N*Nmatrices*sizeof(float),cudaMemcpyHostToDevice));
// --- Creating the array of pointers needed as input to the batched getrf
float **h_inout_pointers = (float **)malloc(Nmatrices*sizeof(float *));
for (int i=0; i<Nmatrices; i++) h_inout_pointers[i]=(float *)((char*)d_A+i*((size_t)N*N)*sizeof(float));
float **d_inout_pointers;
gpuErrchk(cudaMalloc((void**)&d_inout_pointers, Nmatrices*sizeof(float *)));
gpuErrchk(cudaMemcpy(d_inout_pointers,h_inout_pointers,Nmatrices*sizeof(float *),cudaMemcpyHostToDevice));
free(h_inout_pointers);
int *d_PivotArray; gpuErrchk(cudaMalloc((void**)&d_PivotArray, N*Nmatrices*sizeof(int)));
int *d_InfoArray; gpuErrchk(cudaMalloc((void**)&d_InfoArray, Nmatrices*sizeof(int)));
int *h_PivotArray = (int *)malloc(N*Nmatrices*sizeof(int));
int *h_InfoArray = (int *)malloc( Nmatrices*sizeof(int));
cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, d_PivotArray, d_InfoArray, Nmatrices));
//cublasSafeCall(cublasSgetrfBatched(handle, N, d_inout_pointers, N, NULL, d_InfoArray, Nmatrices));
gpuErrchk(cudaMemcpy(h_InfoArray,d_InfoArray,Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));
for (int i = 0; i < Nmatrices; i++)
if (h_InfoArray[i] != 0) {
fprintf(stderr, "Factorization of matrix %d Failed: Matrix may be singular\n", i);
cudaDeviceReset();
exit(EXIT_FAILURE);
}
gpuErrchk(cudaMemcpy(h_A,d_A,N*N*sizeof(float),cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(h_PivotArray,d_PivotArray,N*Nmatrices*sizeof(int),cudaMemcpyDeviceToHost));
for (int i=0; i<N*N; i++) printf("A[%i]=%f\n", i, h_A[i]);
printf("\n\n");
for (int i=0; i<N; i++) printf("P[%i]=%i\n", i, h_PivotArray[i]);
return 0;
}
编译和运行此示例所需的Utilities.cuh
和Utilities.cu
文件保存在CUDA-Utilities存储库中。
请注意
cublas<t>getrfBatched()
也可以在不使用透视的情况下调用,请参阅注释后的cublas<t>getrfBatched()
致电线; cublas<t>getrfBatched()
能够执行许多矩阵的LU分解。上面提供的示例的结构是通用的,尽管它仅涉及一个矩阵的情况。上面给出的示例与Scientific Computing Software Library (SCSL) User’s Guide提供的示例相同,请参见示例3-1,第21页。请注意cublas<t>getrfBatched()
使用相应的LU分解覆盖输入矩阵。完整的L
和U
矩阵可以通过以下Matlab代码提取(cuBLAS用户指南中详述的Matlab代码似乎不正确):
L = eye(3);
for k = 1:3
L(k+1:3,k) = M(k+1:3,k);
end
U = zeros(3);
for k = 1:3
U(k,k:3) = M(k,k:3);
end
置换矩阵 P
可以通过d_PivotArray
向量中的以下Matlab代码提取:
P1 = eye(3);
temp = P1(:,1);
P1(:,1) = P1(:,3);
P1(:,3) = temp;
P2 = eye(3);
temp = P2(:,2);
P2(:,2) = P2(:,3);
P2(:,3) = temp;
关于排列矩阵,cuBLAS用户指南和科学计算软件库(SCSL)用户指南似乎都是错误的。
通过这种方式,
P * A = L * U
,其中
P = P2 * P1
答案 1 :(得分:1)
消除内核中的同步仅适用于同一块的线程。对于大矩阵赋值
LU [n * indexY + row] = factor;
在运算符
之前的某些块中进行求值factor = LU [n * indexY + row] / LU [n * row + row];
因此您获得的值不正确。