我基于 Gauss-Jordan 消元法实现了一个在 GF(2) 上求矩阵逆的函数。该函数使用一个循环,每次迭代调用若干并行函数(CUDA 内核)来处理一列。问题在于:根据待求逆矩阵的大小,在经过一定次数的迭代之后,并行函数的执行时间会增加,而这种增加与该次迭代中矩阵或变量的取值没有直接关系。运行时间的增加在所有函数中都会出现,但每次迭代只有其中一个函数变慢,而不是每次迭代中所有函数都变慢。
/**
 * Searches column `i`, from row `i` downwards, for a non-zero entry that can
 * serve as the pivot for this elimination step.
 *
 * Launch layout used by the caller in this file: <<<height - i, 1>>>, i.e. one
 * single-thread block per candidate row; block b inspects row i + b.
 *
 * The kernel is a no-op when *isOne is non-zero (the diagonal entry was
 * already non-zero, so no pivot search is needed) or when *prow == -2 (the
 * error sentinel checked by the host after the loop).
 *
 * @param M      matrix being reduced, indexed as row * M.width + col
 * @param i      index of the column / diagonal position being processed
 * @param prow   device int; receives the index of a row with a non-zero in
 *               column i. If several rows qualify, whichever atomicExch is
 *               performed last determines the stored value.
 * @param isOne  device int; non-zero disables the search
 */
__global__ void findOne(Matrix M, int i, int *prow, int *isOne) {
    if (!*isOne && *prow != -2) {
        int row = blockIdx.x + i;
        // Bounds guard: stay inside the matrix even if the grid is larger
        // than the number of remaining rows.
        if (row < M.height && M.elements[row * M.width + i]) {
            atomicExch(prow, row);
        }
    }
}
/**
 * Makes the diagonal entry of row `i` non-zero by XOR-ing the pivot row
 * (*prow) into row `i`, in both the working matrix M and the accumulator I.
 *
 * Launch layout used by the caller in this file: 1D, one thread per column
 * (<<<width / BLOCK_SIZE, BLOCK_SIZE>>>).
 *
 * The kernel is a no-op when *isOne is non-zero, or when *prow is -1 or -2
 * (the two sentinel values this file uses for "no pivot row" and "error").
 *
 * @param M      matrix being reduced, indexed as row * M.width + col
 * @param I      matrix receiving the same row operations as M; indexed with
 *               I.width (the caller allocates it with the same shape as M)
 * @param i      row whose diagonal entry must become non-zero
 * @param prow   device int holding the pivot row index
 * @param isOne  device int; non-zero means row i needs no fix-up
 */
__global__ void forceOne(Matrix M, Matrix I, int i, int *prow, int *isOne) {
    const int pivot = *prow;
    if (!*isOne && pivot != -1 && pivot != -2) {
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        // Bounds guard for grids that overshoot the row length.
        if (col < M.width) {
            // Addition in GF(2) is XOR; entries are cast to int for the
            // bitwise operation and stored back into the element type.
            M.elements[i * M.width + col] = (int)M.elements[pivot * M.width + col] ^ (int)M.elements[i * M.width + col];
            I.elements[i * I.width + col] = (int)I.elements[pivot * I.width + col] ^ (int)I.elements[i * I.width + col];
        }
    }
}
/**
 * Elimination step: for every row other than `i` that has a non-zero entry in
 * column `i`, XORs row `i` into that row, in both M and I.
 *
 * Launch layout used by the caller in this file: 2D, one thread per matrix
 * element (x -> column, y -> row).
 *
 * Only columns to the right of `i` are updated in M; column `i` itself is left
 * untouched here, so every thread of a row reads the same M[row][i] test
 * value. I is updated in every column.
 *
 * NOTE(review): the brace structure of this kernel was lost in the source
 * text. It is reconstructed so that the I update is gated by the row tests
 * but not by `col > i` -- confirm against the original implementation.
 *
 * @param M      matrix being reduced, indexed as row * M.width + col
 * @param I      matrix receiving the same row operations; indexed with
 *               I.width (the caller allocates it with the same shape as M)
 * @param i      pivot row / column of the current step
 * @param prow   device int; the kernel is a no-op when *prow == -2
 * @param isOne  unused by this kernel
 */
__global__ void colFix(Matrix M, Matrix I, int i, int *prow, int *isOne) {
    if (*prow != -2) {
        int row = blockIdx.y * blockDim.y + threadIdx.y;
        int col = blockIdx.x * blockDim.x + threadIdx.x;
        // Bounds guard for grids that overshoot the matrix; row i is the
        // pivot row and is never modified here.
        if (row < M.height && col < M.width && row != i) {
            if (M.elements[row * M.width + i]) {
                if (col > i) {
                    M.elements[row * M.width + col] = (int)M.elements[row * M.width + col] ^ (int)M.elements[i * M.width + col];
                }
                I.elements[row * I.width + col] = (int)I.elements[row * I.width + col] ^ (int)I.elements[i * I.width + col];
            }
        }
    }
}
/**
 * Clears column `i` in every row except row `i`, then publishes the next
 * diagonal entry M[i+1][i+1] through *isOne for the following iteration.
 *
 * Launch layout used by the caller in this file: 1D, one thread per row
 * (<<<height / BLOCK_SIZE, BLOCK_SIZE>>>).
 *
 * @param M      matrix being reduced, indexed as row * M.width + col
 * @param i      column that has just been eliminated
 * @param prow   device int; the kernel is a no-op when *prow == -2
 * @param isOne  device int; overwritten with (int)M[i+1][i+1] when that
 *               element exists, otherwise left unchanged
 */
__global__ void colZeroFix(Matrix M, int i, int *prow, int *isOne) {
    if (*prow != -2) {
        int row = blockIdx.x * blockDim.x + threadIdx.x;
        // Bounds guard for grids that overshoot the number of rows.
        if (row < M.height && row != i) {
            M.elements[row * M.width + i] = 0;
        }
        // On the last step (i + 1 == width) there is no next diagonal
        // element; reading it would run past the end of the buffer.
        // A single thread writes the flag; the element it reads lies in
        // column i + 1, which this kernel never modifies.
        const int next = i + 1;
        if (row == 0 && next < M.height && next < M.width) {
            *isOne = (int)M.elements[next * M.width + next];
        }
    }
}
/**
 * Latches the error sentinel: when *isOne is zero and *prow still holds -1,
 * stores -2 into *prow. In every other case *prow is left untouched.
 *
 * Intended for a single-thread launch (<<<1, 1>>>).
 *
 * @param prow   device int holding the pivot row index or a sentinel
 * @param isOne  device int flag; non-zero suppresses the error
 */
__global__ void SetError(int *prow, int *isOne) {
    if (*isOne) {
        return;
    }
    if (*prow != -1) {
        return;
    }
    *prow = -2;
}
/**
 * Re-arms the pivot slot: stores -1 into *prow unless it currently holds the
 * sentinel -2, which is preserved.
 *
 * Intended for a single-thread launch (<<<1, 1>>>).
 *
 * @param prow  device int holding the pivot row index or a sentinel
 */
__global__ void Reset(int *prow) {
    if (*prow == -2) {
        return;
    }
    *prow = -1;
}
/**
 * Inverts this matrix over GF(2) on the GPU using Gauss-Jordan elimination.
 *
 * A device copy of this matrix is reduced column by column while the same row
 * operations are applied to a device identity matrix; the transformed
 * identity is then copied into M.elements.
 *
 * Preconditions: the matrix must be square, non-empty, and its dimension must
 * be a multiple of BLOCK_SIZE, because the launch geometry below uses
 * truncating division and would otherwise leave rows/columns unprocessed.
 *
 * @param M  destination; M.elements must be a host buffer with room for
 *           width * height elements. It is overwritten whenever the device
 *           computation and copy-back succeed, including the singular case.
 * @return   true when an inverse was produced; false when no pivot could be
 *           found for some column (the device-side pivot slot ended at -2),
 *           when the dimensions are unsupported, or when a CUDA call failed.
 */
bool Matrix::matrixInverse(Matrix M) {
    // Reject shapes the launch configuration cannot cover correctly. This
    // also protects the elements[0] read below from an empty matrix.
    if (width <= 0 || width != height || width % BLOCK_SIZE != 0)
        return false;

    Matrix d_M, d_I, I;
    d_M.width = d_M.stride = width; d_M.height = height;
    d_I.width = d_I.stride = width; d_I.height = height;
    d_M.elements = nullptr;
    d_I.elements = nullptr;

    // sizeof(*elements) keeps the byte count tied to the real element type.
    const size_t size = (size_t)width * height * sizeof(*elements);

    I.init(width, height, stride);
    I.IdentityMatrix();

    // Device-side scalars shared by all kernels:
    //   prow  - pivot row for the current column (-1 = none yet, -2 = error)
    //   isOne - value of the diagonal entry about to be processed
    int *prow = nullptr;
    int *isOne = nullptr;
    int row = -1;
    int is = (int)elements[0];

    bool ok =
        cudaMalloc(&d_M.elements, size) == cudaSuccess &&
        cudaMalloc(&d_I.elements, size) == cudaSuccess &&
        cudaMalloc((void**)&prow, sizeof(int)) == cudaSuccess &&
        cudaMalloc((void**)&isOne, sizeof(int)) == cudaSuccess &&
        cudaMemcpy(d_M.elements, elements, size, cudaMemcpyHostToDevice) == cudaSuccess &&
        cudaMemcpy(d_I.elements, I.elements, size, cudaMemcpyHostToDevice) == cudaSuccess &&
        cudaMemcpy(prow, &row, sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess &&
        cudaMemcpy(isOne, &is, sizeof(int), cudaMemcpyHostToDevice) == cudaSuccess;

    if (ok) {
        // One thread per matrix element for the 2D elimination kernel.
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 dimGrid(width / dimBlock.x, height / dimBlock.y);

        // Kernel launches are asynchronous: the host only enqueues work here.
        // Wrapping a launch in host clocks therefore measures enqueue time
        // (which can jump once the driver's launch queue is full), not kernel
        // execution time. Use cudaEvent timing or a profiler instead.
        for (int i = 0; i < width; i++) {
            findOne<<<height - i, 1>>>(d_M, i, prow, isOne);
            SetError<<<1, 1>>>(prow, isOne);
            forceOne<<<width / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, d_I, i, prow, isOne);
            colFix<<<dimGrid, dimBlock>>>(d_M, d_I, i, prow, isOne);
            colZeroFix<<<height / BLOCK_SIZE, BLOCK_SIZE>>>(d_M, i, prow, isOne);
            Reset<<<1, 1>>>(prow);
        }

        // cudaGetLastError reports launch-configuration failures; the
        // blocking copies below wait for the queued kernels and report any
        // execution failure.
        ok = cudaGetLastError() == cudaSuccess &&
             cudaMemcpy(M.elements, d_I.elements, size, cudaMemcpyDeviceToHost) == cudaSuccess &&
             cudaMemcpy(&row, prow, sizeof(int), cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    // Release device memory on every path (cudaFree accepts null pointers).
    cudaFree(d_M.elements);
    cudaFree(d_I.elements);
    cudaFree(prow);
    cudaFree(isOne);

    return ok && row != -2;
}
所有矩阵都是三角矩阵;当我开始使用规模为 64^2 和 128^2 的矩阵时,这一现象开始变得明显。矩阵的元素存储在 `float *elements` 中,长度为 width * height。