Question

简要概述：程序的主要结构是一棵不平衡的树（不是二叉树），它是动态创建的；每创建一个节点，我就会检查它是否是解（如果是，程序就可以终止）。树是不平衡的，所以可能一个分支有 N 个节点，另一个分支只有 1 个节点，还有一个分支有 N^2 个节点。但是创建和检查单个节点的工作量很小，因此我不能让每个线程只处理 1 个节点，否则开销将超过执行时间。我该怎么办？

嗨，我已经完成了这个数独求解器，我想对其进行并行化处理，但是我不确定使用流水线（pipeline）、farm、map-reduce 等哪种方式最好。我将向您发布顺序代码，以便于并行化。该程序的主要结构是一棵树；由于采用的是蛮力方法，每种可能性都是一个拥有自己的 sudokuMatrix 的节点。程序的逻辑遵循以下步骤： - 从输入中读取数独矩阵或生成它。 - 找到第一个空单元格，然后计算可以在其中插入哪些数字（遵守数独规则）。 - 为每种可能性生成一个节点（并将其作为“子节点”添加到父节点），每个节点都有一个矩阵，其 sudokuMatrix 中填入了相应的数字。 - 这样递归地继续下去，直到找到解为止。

！该算法必须是蛮力的，这是不可更改的要求；我当然可以选择另一种蛮力方法，但我希望保留现有方法，或者至少不要改动太多。

现在，我尝试的并行化方法是：把开始时的工作大致分成工作量相等的 n 个部分（其中 n = 所选的线程数），然后用 farm 模式把它们分配给各个线程。由于递归将创建一棵非常大的不平衡树，因此我只按顺序执行前 1-2 步，直到子节点的数量至少等于工作线程（worker）的数量（可以把它想象成一棵大树的第一层或第二层），然后将它们等分，再把子树交给各个工作线程（这些子树当然是不平衡的，但由于树是动态创建的，我无法更精确地平衡它）。

我面临的问题是：我无法先遍历整棵递归树、再将其等分并分配给线程，因为树是动态生成的，而且一旦生成了解矩阵，整个程序就会终止（因为已经找到了解）。另一方面，我也不能在树的每一层都把工作拆分给所有线程，因为每一层的工作量很小，这会导致很大的开销。

什么是使该程序并行化的更好方法？（我知道蛮力方法效率不高，但同样，我无法更改这一点；确切地说，它必须是蛮力方法，所以即使换成效率稍高的做法，它也仍然得是蛮力方法。）

我只向您发布顺序解决方案（这是完整的代码，因此您可以根据需要重现它）：！请注意，在递归调用（solveSudoku）上，我增加了一点睡眠以模拟更多工作，否则该功能太快，开销将使其无法承受。

#include <iostream>
#include <unistd.h>
#include <bits/stdc++.h>
#include <typeinfo>
#include <chrono>
#include <thread>

// Cell value that marks an empty (not yet filled) position in a grid.
#define UNASSIGNED 0
// Side length of the board; grids are stored as flat arrays of N*N cells.
#define N 9
// Sentinel returned by findCell when the grid contains no UNASSIGNED cell.
#define ERROR_PAIR std::make_pair(-1,-1)

using namespace std;
// Set to true by solveSudoku once a solved grid has been printed; solveSudoku
// checks it on entry so that later calls return without searching.
atomic<bool> solutionFound{false};

//declaration for new tree node
struct Node 
{ 
    array<unsigned char, N*N> grid;
    vector<Node *>child; 
}; 
   
 // Allocates a tree node on the heap that holds a copy of newGrid and has no
 // children yet. The caller receives a raw pointer obtained from `new`.
Node *newNode(const array<unsigned char, N*N> &newGrid)
{
    // Node is a plain aggregate: copy the grid and start with an empty child list.
    return new Node{newGrid, {}};
}
  

// Writes the board to stdout, one line per row index, with a dashed line
// before rows 3 and 6 and a "| " marker before columns 3 and 6.
// The cell at (row, col) is read from index row + col * N.
void printGrid(const array<unsigned char, N*N> &grid)
{
    for (int r = 0; r < N; ++r) {
        const bool startsNewBand = (r == 3 || r == 6);
        if (startsNewBand) {
            std::cout << "---------------------" << std::endl;
        }

        for (int c = 0; c < N; ++c) {
            const bool startsNewStack = (c == 3 || c == 6);
            if (startsNewStack) {
                std::cout << "| ";
            }
            // Convert to int so the cell is printed as a number, not a character.
            const int cellValue = grid[r + c * N];
            std::cout << cellValue << " ";
        }

        std::cout << std::endl;
    }
}

// Returns true when `val` does not yet occur in the column, the row, or the
// 3x3 box that contains cell (row_, col_) of `grid`.
//
// The cell at (row, col) is stored at index row + col * N. The function only
// reads the grid; it does not check whether (row_, col_) itself is empty.
bool canInsert(const int &val,const int &row_, const int &col_,const array<unsigned char, N*N> &grid){
    // Every cell that shares col_ with the target.
    for (int row = 0; row < N; ++row) {
        if (grid[row + col_ * N] == val) return false;
    }

    // Every cell that shares row_ with the target.
    for (int col = 0; col < N; ++col) {
        if (grid[row_ + col * N] == val) return false;
    }

    // Only the nine cells of the enclosing 3x3 box are visited, instead of
    // scanning the whole board and filtering by box membership.
    const int boxRow = (row_ / 3) * 3;
    const int boxCol = (col_ / 3) * 3;
    for (int row = boxRow; row < boxRow + 3; ++row) {
        for (int col = boxCol; col < boxCol + 3; ++col) {
            if (grid[row + col * N] == val) return false;
        }
    }
    return true;
}

//vector<vector<int>> gridTest(9, vector<int>(9,0)); il vettore deve essere inizializzato, cosi.
//n = how many numbers you want to initialize the matrix with
void generateMatrix(const int &seed,const int &n,array<unsigned char, N*N> &grid){ 
    srand(seed);
    int i = 0;
    while ( i < n){
        int row = rand()%9;
        int col = rand()%9;
        int val = rand()%9+1;
        if(grid[row+col*N]== UNASSIGNED && canInsert(val,row,col,grid)){
        grid[row+col*N] = val;
        i++;
        }
    }
    return;
}

bool isSafe(const array<unsigned char, N*N> &grid) //check if the sudoku is solved
{ 
    char row_[9][N+1] = {0};
    char column_[9][N+1] = {0};
    char box[3][3][N+1] = {0};
    for (int row = 0; row < N; row++) { 
        for (int col = 0; col < N; col++) { 
            // mark the element in row column and box 
            row_[row][grid[row+col*N]] += 1; 
            column_[col][grid[row+col*N]] += 1; 
            box[row / 3][col / 3][grid[row+col*N]] += 1; 
  
            // if an element is already 
            // present in the hashmap 
            if ( 
                box[row / 3][col / 3][grid[row+col*N]] > 1 
                || column_[col][grid[row+col*N]] > 1 
                || row_[row][grid[row+col*N]] > 1) 
                return false; 
        } 
    }
    return true; 
} 


// Locates the first UNASSIGNED cell, scanning with the first coordinate as the
// outer loop and the second as the inner loop; the cell for (first, second) is
// read from index first + second * N. Returns ERROR_PAIR when no cell is empty.
pair<int,int> findCell(const array<unsigned char, N*N> &grid){
    for (int step = 0; step < N * N; ++step) {
        // step walks (0,0), (0,1), ..., (0,N-1), (1,0), ... in that order.
        const int first = step / N;
        const int second = step % N;
        const bool isEmpty = (grid[first + second * N] == UNASSIGNED);
        if (isEmpty) {
            return pair<int,int>(first, second);
        }
    }
    return ERROR_PAIR;
}

// Turns every grid in `choices` into a new child of `node` (in list order)
// and leaves `choices` empty afterwards.
void addChoices (list<array<unsigned char, N*N>> &choices,Node &node){
    for (const array<unsigned char, N*N> &candidate : choices) {
        node.child.push_back(newNode(candidate));
    }
    // The original drained the list element by element; the end state is the same.
    choices.clear();
}

// Builds one candidate grid per value that canInsert accepts for (row, col).
//
// Each candidate is a copy of `grid` with the cell at index row + col * N set
// to that value; candidates are returned in increasing value order. The list
// is empty when no value fits.
//
// The candidate range is 1..N rather than a hard-coded 1..9, matching the rest
// of the file.
list<array<unsigned char, N*N>> getChoices(const int &row,const int &col,const array<unsigned char, N*N> &grid){
    list<array<unsigned char, N*N>> choices;
    for (int value = 1; value <= N; ++value) {
        if (!canInsert(value, row, col, grid)) {
            continue;
        }
        // Copy the board straight into the list, then patch the one cell.
        choices.push_back(grid);
        choices.back()[row + col * N] = static_cast<unsigned char>(value);
    }
    return choices;
}

// Depth-first brute-force search over the sibling nodes in `nodes`.
//
// For every node: if it still has an empty cell, one child is created per
// legal value for that cell and the children are searched recursively; if it
// has no empty cell and isSafe accepts it, the grid is printed, solutionFound
// is set and the search stops.
//
// Fixes relative to the previous version:
//  * A node whose empty cell has no legal value is a dead end for that node
//    only. The old code returned from the function there, silently skipping
//    all remaining siblings and possibly missing the solution; now the loop
//    moves on to the next sibling.
//  * solutionFound is re-checked before each sibling, so siblings are no
//    longer expanded (and slept on) after a solution has been printed.
//  * findCell is evaluated once per node instead of twice.
//
// Children are attached to the nodes with newNode and are not freed here.
void solveSudoku(vector<Node *> &nodes){
    // Artificial delay that simulates a heavier per-call workload.
    std::this_thread::sleep_for(std::chrono::milliseconds(10));

    for (Node *n : nodes) {
        if (solutionFound) {
            return;
        }

        const pair<int,int> freeCell = findCell(n->grid);
        if (freeCell != ERROR_PAIR) {
            list<array<unsigned char, N*N>> choices = getChoices(freeCell.first, freeCell.second, n->grid);
            if (choices.empty()) {
                continue;  // dead end for this node; try the next sibling
            }
            addChoices(choices, *n);
            solveSudoku(n->child);
        } else if (isSafe(n->grid)) {
            solutionFound = true;
            printGrid(n->grid);
            cout << "That's the first solution found !" << endl;
            return;
        } else {
            // Full grid that violates the rules.
            cout << "No solution found ! " << endl;
            return;
        }
    }
}



//TO DO: fai inserire quando chiami il programma se vuole generare una matrice, true o false, e se si quanti numeri iniziali dare.
int main(int argc, char * argv[]) {
    chrono::high_resolution_clock::time_point t1 = chrono::high_resolution_clock::now();
    array<unsigned char, N*N> grid = 
                                      {3, 0, 6, 5, 0, 8, 4, 0, 0, 
                                       5, 2, 0, 0, 0, 0, 0, 0, 0, 
                                       0, 8, 7, 0, 0, 0, 0, 3, 1, 
                                       0, 0, 3, 0, 1, 0, 0, 8, 0, 
                                       9, 0, 0, 8, 6, 3, 0, 0, 5, 
                                       0, 5, 0, 0, 9, 0, 6, 0, 0, 
                                       1, 3, 0, 0, 0, 0, 2, 5, 0, 
                                       0, 0, 0, 0, 0, 0, 0, 7, 4, 
                                       0, 0, 5, 2, 0, 6, 3, 0, 0};
    
    Node *root = newNode(grid);
    vector<Node *> vec;
    vec.push_back(root);
    solveSudoku(vec);

     chrono::high_resolution_clock::time_point t2 = chrono::high_resolution_clock::now();
     chrono::duration<double> time_span = chrono::duration_cast<chrono::duration<double>>(t2 - t1);
    cout << "end" << endl;
        std::cout << "It took me " << time_span.count() << " seconds." << endl;
    return(0);
}

Answer 1

简单的解决方案：OpenMP 和 schedule(dynamic)

如果您能够将代码转换成大循环，则可以使用 #pragma omp parallel for schedule(dynamic)来并行化整个循环。对于您的具体案例，我无法提供示例，因此我选择了ray tracing：

// Computes every pixel with calc_pixel and stores it in `picture`.
//
// picture : output buffer with room for width * height entries.
// width   : number of values taken by the outer index i.
// height  : number of values taken by the inner index j.
//
// The pixel for (i, j) is stored at i * height + j. The previous index
// i * width + j ran past the end of the buffer whenever width > height and
// mapped different pixels to the same slot otherwise; for square pictures the
// layout is unchanged.
//
// Iterations are independent, so both loops are collapsed into one iteration
// space and handed out to threads dynamically to even out varying pixel cost.
void paint(char* picture, int width, int height) {
# pragma omp parallel for collapse(2) schedule(dynamic)
    for(int i=0; i < width; ++i) {
        for(int j=0; j < height; ++j) {
           picture[i*height+j] = calc_pixel(i, j);
        }
    }

}

在光线跟踪中，您可以自己计算每个像素，而无需知道所有其他像素。但是计算像素可能需要花费不同的时间，因此，如果仅给每个线程相同数量的像素，则工作量将不平衡。

请注意，要实现 schedule(dynamic)，需要一个集中的工作分配机制：线程会请求一项工作，并通过访问共享的（同步的）队列或类似结构来取得它。在这种情况下，与这部分开销相比，单个像素的工作量太小了。

为解决此问题，我们可以给 schedule(dynamic) 添加第二个参数：被视为“一个工作单元”的迭代次数。假设我们希望将 200 个像素视为“一个工作单元”，只需写成 schedule(dynamic, 200)。这样会带来更多的负载不平衡，但同步开销会更少。

返回您的示例

比方说我们不能使用OpenMp。可能不允许使用OpenMp，您可能不满意使用OpenMp，或者根本无法使用for循环解决问题。我们仍然可以从这种方法中学习，并为您的问题手工构建。

schedule(dynamic) 背后的想法很简单：工作被放进一个队列，每当某个线程完成了手头的工作，它就去请求新的工作。我们可以把这个思路应用到您的问题上，只要确保共享的数据结构受互斥锁保护即可。

您的工作是什么？这是（部分解决）数独。有些可能无法解决，有些则可以解决。找到解决方案后，就完成了。

让我们看看将为每个线程调用的工作方法：

#define WORKSIZE 20
void work() {
    while(!finished) {
        int* field;
        field = getWork();
        std::vector<int*> orphans;
        for (int i=0; i < WORKSIZE; ++i) {
            auto pos = findCell(field);
            auto options = getChoices(pos.first, pos.second,field);
            if (options.empty()) {
                delete[] field;
                if (orphans.empty()) {
                    // get new work
                    break;
                } else {
                    field = orphans.pop_back();
                    continue;
                }
            }
            for (int j = 0; j < options.size() - 1; ++j) {
                // store all your other options in orphans as you are not going to try them right now
                int* newfield = new int[N*N];
                std::copy(field, field+N*N, newfield);
                newfield[pos.first*N+pos.second] = options[j];
                orphans.push_back(newfield);
            }
            // work on the last option
            field[pos.first*N+pos.second] = options[options.size()-1];
        }
        // We've now finished our item of work.
        // it's time to share our remaining work with the centralized queue:
        orphans.push_back(field);
        publishWork(orphans);
    }

}

现在，您只需要实现线程安全的 getWork 和 publishWork，然后尝试调整 WORKSIZE 的取值就可以了。

请注意，这不是在没有openmp的情况下如何并行化不平衡工作的唯一选择。我喜欢它。

如何与不可预测的工作并行化？

1 个答案: