Question

我正在实现一个模式匹配算法：对每个旋转角度（-60 至 60 度），把模板的梯度信息在整幅目标图像的梯度图上逐位置滑动匹配。每个旋转角度对应的模板信息都已预先计算并保存，即总共预处理并保存了 121 个模板。

问题在于这非常耗时（大约 110 毫秒），因此我决定按旋转角度范围（-60 至 -30、-30 至 0、0 至 30、30 至 60）把匹配工作拆分到 4 个线程中执行。但是多线程版本反而比单线程版本花费更多的时间（大约 115 至 120 毫秒）。

代码段是...


// Capacity of the fixed-size result arrays: totalResultsTemp below and the
// per-call resultsPerDeg array declared inside CShapeMatch::match.
#define MAXTARGETNUM    64
// File-scope result buffer shared by every match() call; match() appends to it
// at index totalResultsNum while holding mtx.
// NOTE(review): no visible code checks totalResultsNum against MAXTARGETNUM
// before writing — confirm the bound is enforced elsewhere.
MatchResultA totalResultsTemp[MAXTARGETNUM];



// Slides each pre-rotated template in an index range over the search window,
// collects the candidate positions whose score exceeds MinScore, and appends
// them to the shared totalResultsTemp array.
//
// The snippet is abridged: "...." marks code omitted by the author (including
// the computation of PartialScore and the declarations of several locals), and
// the braces as pasted do not balance. Comments below describe only what is
// visible.
//
// ShapeInfoVec  array indexed by rotation index k; each entry supplies
//               NoOfCordinates, Coordinates, EdgeDerivativeX/Y, EdgeMagnitude
//               and Angel. Entry 0 also supplies AngleNum.
// SearchRegion  passed by value; StartX/StartY/EndX/EndY bound the scanned
//               positions, AngleStart/AngleStop/AngleStep select the k range.
// MinScore      a position is recorded only when PartialScore > MinScore.
// Greediness    not referenced in the visible part of the body.
// width,height  bounds for template points; width is also the row stride used
//               to index the three buffers.
// pBufGradX, pBufGradY, pBufMag
//               read (never written here) at offset curY*width + curX.
// corr          not referenced in the visible part of the body.
void CShapeMatch::match(ShapeInfo *ShapeInfoVec, search_region SearchRegion, float MinScore, float Greediness, int width,int height, int16_t  *pBufGradX ,int16_t  *pBufGradY,float  *pBufMag,  bool corr)
{
  // Candidates found by this call; local to the call, so not shared between threads.
  MatchResultA resultsPerDeg[MAXTARGETNUM];
....
....
    // Scan window: i runs over [startX, endX), j over [startY, endY).
    int startX =  SearchRegion.StartX;
    int startY =  SearchRegion.StartY;
    int endX   =  SearchRegion.EndX;
    int endY   =  SearchRegion.EndY;
    
    float AngleStep  = SearchRegion.AngleStep;
    float AngleStart = SearchRegion.AngleStart;
    float AngleStop = SearchRegion.AngleStop;

    // Map the angle range to template indices, offset by AngleNum/2 + AngleNum%2.
    // NOTE(review): the cast binds tighter than '/', so this evaluates
    // ((int)AngleStart) / AngleStep as a float division that is truncated only
    // when stored into the int. Presumably (int)(AngleStart/AngleStep) was
    // intended — confirm.
    int startIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStart/AngleStep;
    int stopIndex = (int)(ShapeInfoVec[0].AngleNum/2) + ShapeInfoVec[0].AngleNum%2+(int)AngleStop/AngleStep;

// One iteration per template index; stopIndex itself is excluded.
for (int k = startIndex; k < stopIndex ; k++){
         .... 
         for(int j = startY; j < endY; j++){
            for(int i = startX; i < endX; i++){
                
                    // Visit every stored edge point of template k, placed at (i, j).
                    for(int m = 0; m < ShapeInfoVec[k].NoOfCordinates; m++)
                    {
                        curX = i + (ShapeInfoVec[k].Coordinates + m)->x;        // template X coordinate
                        curY = j + (ShapeInfoVec[k].Coordinates + m)->y ;       // template Y coordinate
                        
                        iTx = *(ShapeInfoVec[k].EdgeDerivativeX + m);           // template X derivative
                        iTy = *(ShapeInfoVec[k].EdgeDerivativeY + m);           // template Y derivative
                        iTm   = *(ShapeInfoVec[k].EdgeMagnitude + m);           // template gradients magnitude
                        
                        // Skip template points that fall outside the image.
                        if(curX < 0 ||curY < 0||curX > width-1 ||curY > height-1)
                            continue;
                        // Row-major offset into the image-sized buffers.
                        offSet = curY*width + curX;
                        iSx = *(pBufGradX + offSet);            // get corresponding  X derivative from source image
                        iSy = *(pBufGradY + offSet);            // get corresponding  Y derivative from source image
                        iSm = *(pBufMag   + offSet);

                        // NOTE(review): the code that computes PartialScore, and the end
                        // of the m-loop, appear to have been cut from the snippet here.
                        if (PartialScore > MinScore)
                    {   
                    
                        float Angle = ShapeInfoVec[k].Angel;
                        // Look for an already-recorded candidate within 5 pixels of (i, j)
                        // on both axes.
                        bool hasFlag = false;
                        for(int n = 0; n < resultsNumPerDegree; n++)
                        {       
                            if(abs(resultsPerDeg[n].CenterLocX - i) < 5 && abs(resultsPerDeg[n].CenterLocY - j) < 5)
                            {   
                                hasFlag = true;
                                // Overwrite the neighbour only when the new score is higher.
                                // The scan stops only in that case; otherwise it keeps
                                // checking the remaining entries.
                                if(resultsPerDeg[n].ResultScore < PartialScore)
                                {   
                                    resultsPerDeg[n].Angel = Angle;
                                    resultsPerDeg[n].CenterLocX = i;
                                    resultsPerDeg[n].CenterLocY = j;
                                    resultsPerDeg[n].ResultScore = PartialScore;
                                    
                                    break;
                                }
                            }
                        }
                        // No nearby candidate: append a new one.
                        // NOTE(review): no visible check that resultsNumPerDegree stays
                        // below MAXTARGETNUM before this write — confirm.
                        if(!hasFlag)
                        {   
                            resultsPerDeg[resultsNumPerDegree].Angel = Angle;
                            resultsPerDeg[resultsNumPerDegree].CenterLocX = i;
                            resultsPerDeg[resultsNumPerDegree].CenterLocY = j;
                            resultsPerDeg[resultsNumPerDegree].ResultScore = PartialScore;
    
                            resultsNumPerDegree ++;
                        }
                        // Keeps the larger of the two values, i.e. a running maximum.
                        minScoreTemp = minScoreTemp < PartialScore ? PartialScore : minScoreTemp;   
                    }
                }
            }
            
            
            
            // Copy this call's candidates into the shared array. The mutex is
            // locked and unlocked once per copied element.
            // NOTE(review): no visible check of totalResultsNum against MAXTARGETNUM.
            for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }
        
            // NOTE(review): no declaration of n is visible at this scope; its role
            // cannot be determined from the snippet.
            n++;
}

/**
 * Runs CShapeMatch::match on four angle sub-ranges, one thread per sub-range,
 * waits for all of them, and prints the elapsed wall-clock time in
 * milliseconds.
 *
 * Sub-ranges: [AngleStart, -30], [-30, 0], [0, 30], [30, AngleStop], where
 * AngleStart/AngleStop are the values held by *SearchRegion on entry.
 *
 * Changes from the previous version:
 *  - Timing uses std::chrono::steady_clock. clock() reports the CPU time
 *    consumed by the whole process, summed over all of its threads, so it
 *    cannot show a wall-clock speed-up from threading.
 *  - *SearchRegion is no longer modified; each thread gets its own copy.
 *    Previously AngleStart was left at 30 after the call.
 *  - The three image buffers are released before returning.
 *
 * Requires <chrono> (add the include explicitly if it is not already pulled
 * in through <thread>).
 */
void CallerFunction(){
    int16_t  *pBufGradX   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
    int16_t  *pBufGradY   = (int16_t *) malloc(bufferSize * sizeof(int16_t));
    float    *pBufMag     = (float *) malloc(bufferSize * sizeof(float));

    // Bail out instead of handing null buffers to the worker threads.
    if (pBufGradX == NULL || pBufGradY == NULL || pBufMag == NULL) {
        free(pBufGradX);    // free(NULL) is a no-op
        free(pBufGradY);
        free(pBufMag);
        return;
    }

    // Monotonic wall-clock timestamp.
    const auto start = std::chrono::steady_clock::now();

    // Per-thread copies of the search region; only the angle bounds differ.
    auto region1 = *SearchRegion;
    region1.AngleStop = -30;

    auto region2 = *SearchRegion;
    region2.AngleStart = -30;
    region2.AngleStop = 0;

    auto region3 = *SearchRegion;
    region3.AngleStart = 0;
    region3.AngleStop = 30;

    auto region4 = *SearchRegion;   // keeps the caller's AngleStop
    region4.AngleStart = 30;

    thread t1(&CShapeMatch::match, this, ShapeInfoVec, region1, MinScore, Greediness, width, height, pBufGradX, pBufGradY, pBufMag, corr);
    thread t2(&CShapeMatch::match, this, ShapeInfoVec, region2, MinScore, Greediness, width, height, pBufGradX, pBufGradY, pBufMag, corr);
    thread t3(&CShapeMatch::match, this, ShapeInfoVec, region3, MinScore, Greediness, width, height, pBufGradX, pBufGradY, pBufMag, corr);
    thread t4(&CShapeMatch::match, this, ShapeInfoVec, region4, MinScore, Greediness, width, height, pBufGradX, pBufGradY, pBufMag, corr);

    t1.join();
    t2.join();
    t3.join();
    t4.join();

    const auto end = std::chrono::steady_clock::now();
    cout << std::chrono::duration<double, std::milli>(end - start).count() << endl;

    // The buffers are owned by this function; release them once all workers are done.
    free(pBufGradX);
    free(pBufGradY);
    free(pBufMag);
}

可以看到，代码中有很多堆内存访问，但它们都是只读的。只有 totalResultsTemp 和 totalResultsNum 是会被写入的共享全局资源。

我的PC配置是 i5-7200U CPU @ 2.50GHz 4 cores 4 Gig RAM Ubuntu 18

Answer 1

// Loop quoted from the question: copies each per-call result into the shared
// totalResultsTemp array. The mutex is locked and unlocked once per element.
for(int i = 0; i < resultsNumPerDegree; i++)
                {
                    mtx.lock();
                    totalResultsTemp[totalResultsNum] = resultsPerDeg[i];
                    totalResultsNum++;
                    mtx.unlock();
                }

您在向一个静态数组写入数据，而互斥锁的加锁和解锁实际上很耗时（time consuming）。与其使用锁，不如尝试使用 std::atomic_int；或者，我认为更好的做法是，给每个线程传入它存放结果的确切位置，这样您就不必再处理同步问题了。

Answer 2

C/C++ 中的 POSIX 线程并不是真正并发的，因为操作系统必须把分配给父进程的时间再划分给它所拥有的各个线程。因此，您的算法只在一个核心上执行。要利用多核技术，必须使用 OpenMP。这个接口库可以让您把算法拆分到不同的物理核心上运行。这里有一个很好的 OpenMP 教程（tutorial）。

多线程比单个进程花费更多的时间

2 个答案: