我基于Gauss-Jordan算法编写了一个在GF(2)上求矩阵逆的函数:主体是一个循环,每次迭代调用若干并行(核)函数来消去/固定一列。问题在于:取决于待求逆矩阵的大小,在进行一定数量的迭代之后,这些并行函数的执行时间会突然增加,而这与该次迭代中矩阵或变量的取值没有直接关系。此外,运行时间的增加是在某一次迭代中同时出现在该迭代的所有核函数上,而不是均匀分布在每次迭代中。
我的代码如下:
// Pivot search for Gauss-Jordan step i: one block per candidate row
// (rows i .. i + gridDim.x - 1), one thread per block. Each block tests
// column i of its row for a non-zero entry and, if found, records the row
// index in *prow via atomicExch (when several rows qualify, any one of them
// may win the race — all are valid pivots over GF(2)).
// Does nothing when the diagonal is already 1 (*isOne) or the matrix has
// been flagged singular (*prow == -2).
__global__ void findOne(Matrix M, int i, int *prow, int *isOne) {
    if (*isOne || *prow == -2)
        return;
    int candidate = blockIdx.x + i;
    if (M.elements[candidate * M.width + i])
        atomicExch(prow, candidate);
}
// GF(2) row addition: XORs the pivot row (*prow) into row i of both M and
// the inverse accumulator I, so that M[i][i] becomes 1. One thread per
// column; the launch must cover exactly M.width columns.
// No-op when no pivot was needed (*isOne), none was found (*prow == -1),
// or the matrix is singular (*prow == -2).
__global__ void forceOne(Matrix M, Matrix I, int i, int *prow, int *isOne) {
    if (*isOne || *prow == -1 || *prow == -2)
        return;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int src = *prow;  // pivot row index, identical for every thread
    M.elements[i * M.width + col] = (int)M.elements[src * M.width + col] ^ (int)M.elements[i * M.width + col];
    I.elements[i * I.width + col] = (int)I.elements[src * I.width + col] ^ (int)I.elements[i * I.width + col];
}
// Elimination sweep for step i: every row (except the pivot row i) that has
// a 1 in column i gets the pivot row XORed into it, in both M and I.
// 2D launch: one thread per (row, col) element of the matrix.
__global__ void colFix(Matrix M, Matrix I, int i, int *prow, int *isOne) {
    if (*prow == -2)
        return;  // matrix already flagged singular
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row == i)
        return;  // never eliminate the pivot row against itself
    if (!M.elements[row * M.width + i])
        return;  // this row has no 1 in the pivot column — nothing to do
    // In M only columns right of the pivot need updating: columns < i were
    // zeroed in earlier steps and column i itself is cleared afterwards by
    // colZeroFix (also keeping column i intact here avoids racing with the
    // predicate read above). I, however, is updated across all columns.
    if (col > i) {
        M.elements[row * M.width + col] = (int)M.elements[row * M.width + col] ^ (int)M.elements[i * M.width + col];
    }
    I.elements[row * I.width + col] = (int)I.elements[row * I.width + col] ^ (int)I.elements[i * I.width + col];
}
// Finalizes column i: zeroes every entry of the column except the diagonal,
// and (on the diagonal thread, row == i) pre-loads *isOne with the value of
// the NEXT diagonal element so the next iteration knows whether it needs a
// pivot search. One thread per row.
//
// Fix: the original read M.elements[(i + 1) * M.width + i + 1]
// unconditionally, which is an out-of-bounds global-memory read on the
// final iteration (i == M.width - 1). The read is now guarded.
__global__ void colZeroFix(Matrix M, int i, int *prow, int *isOne) {
    if (*prow == -2)
        return;  // matrix already flagged singular
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row != i) {
        M.elements[row * M.width + i] = 0;
    } else if (i + 1 < M.width) {
        *isOne = (int)M.elements[(i + 1) * M.width + (i + 1)];
    } else {
        // Last iteration: there is no next diagonal element and *isOne is
        // never consulted again; store 1 so any stray later launch is inert.
        *isOne = 1;
    }
}
// Single-thread kernel, run after findOne: if the diagonal was not already 1
// and the pivot search left *prow untouched (-1), no usable pivot exists —
// the matrix is singular. Latch the permanent error state -2 into *prow.
__global__ void SetError(int *prow, int *isOne) {
    bool pivotMissing = (*prow == -1) && !*isOne;
    if (pivotMissing)
        *prow = -2;
}
// Single-thread kernel, run at the end of each iteration: re-arm the pivot
// marker to -1 for the next pass — unless the permanent singular-matrix
// state (-2) has been latched, which must survive to the end.
__global__ void Reset(int *prow) {
    if (*prow == -2)
        return;
    *prow = -1;
}
// Inverts this matrix over GF(2) with a Gauss-Jordan sweep on the GPU and
// writes the inverse into M.elements. Returns false when the device-side
// error state (-2, singular matrix) was latched, true otherwise.
// Assumes width == height and both are multiples of BLOCK_SIZE (the grids
// below do not guard partial tiles, matching the original launch configs).
//
// Fixes vs. the original:
//  * `prow` was copied back to the host AFTER cudaFree(prow) — a
//    use-after-free of device memory; the copy now precedes the frees.
//  * the pointer declaration used a non-ASCII '∗' (U+2217) instead of '*',
//    which does not compile.
//
// NOTE(review): kernel launches are asynchronous. Timing individual
// launches with a host clock (as the removed commented-out code did)
// measures launch-queue back-pressure, not kernel runtime: once the driver's
// launch queue fills, a launch call blocks until a slot frees, so seemingly
// random later launches appear slow — and the slowdown grows with matrix
// size. Use cudaEvent_t (cudaEventRecord/cudaEventElapsedTime) to time
// actual kernel execution.
bool Matrix::matrixInverse(Matrix M) {
    Matrix d_M, d_I, I;
    d_M.width = d_M.stride = width; d_M.height = height;
    d_I.width = d_I.stride = width; d_I.height = height;
    // Elements are float; sizeof(float) == sizeof(int) == 4 on CUDA
    // targets, so the byte count is unchanged — spelled as float for clarity.
    size_t size = width * height * sizeof(float);
    cudaMalloc(&d_M.elements, size);
    cudaMemcpy(d_M.elements, elements, size, cudaMemcpyHostToDevice);
    I.init(width, height, stride);
    I.IdentityMatrix();
    cudaMalloc(&d_I.elements, size);
    cudaMemcpy(d_I.elements, I.elements, size, cudaMemcpyHostToDevice);

    // 2D configuration covering the full matrix for colFix.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

    // Device-side control words, so the loop can issue launches without
    // synchronizing with the host between iterations:
    //  *prow : pivot row for this step; -1 = none found yet, -2 = singular.
    //  *isOne: non-zero when the current diagonal element is already 1.
    int *prow, row = -1;
    cudaMalloc((void**)&prow, sizeof(int));
    cudaMemcpy(prow, &row, sizeof(int), cudaMemcpyHostToDevice);
    int *isOne, is = (int)elements[0];
    cudaMalloc((void**)&isOne, sizeof(int));
    cudaMemcpy(isOne, &is, sizeof(int), cudaMemcpyHostToDevice);

    // One elimination step per column.
    for (int i = 0; i < width; i++) {
        findOne <<< (height - i), 1 >>> (d_M, i, prow, isOne);
        SetError <<< 1, 1 >>> (prow, isOne);
        forceOne <<< width / BLOCK_SIZE, BLOCK_SIZE >>> (d_M, d_I, i, prow, isOne);
        colFix <<< dimGrid, dimBlock >>> (d_M, d_I, i, prow, isOne);
        colZeroFix <<< height / BLOCK_SIZE, BLOCK_SIZE >>> (d_M, i, prow, isOne);
        Reset <<< 1, 1 >>> (prow);
    }

    // cudaMemcpy is synchronous with respect to the preceding launches, so
    // these reads observe the finished computation. Both copies happen
    // BEFORE the device buffers are freed.
    cudaMemcpy(M.elements, d_I.elements, size, cudaMemcpyDeviceToHost);
    cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(prow);
    cudaFree(isOne);
    cudaFree(d_M.elements);
    cudaFree(d_I.elements);
    // NOTE(review): I.elements is allocated by I.init() — confirm Matrix's
    // destructor frees it; otherwise this method leaks one host buffer per call.
    return row != -2;
}
所有矩阵都是(上/下)三角矩阵;当我开始使用 64×64 和 128×128 的矩阵时,这种运行时间增加的现象开始显现。矩阵元素存储在 `float *elements`(长度为 width * height)数组中。