如何使用OpenMP(C)优化中值过滤器?

时间:2018-01-24 04:53:09

标签: c opencv image-processing parallel-processing openmp

我编写了一个程序,其中包含两个版本的中值滤波器,均用C语言基于OpenCV实现:一个是顺序执行的,另一个使用OpenMP并行化。我的问题在于,无论怎样调整块大小(chunk size)或线程数,OpenMP版本的运行速度似乎都比顺序版本慢。

非常欢迎任何想法/建议!

这是我的顺序代码

void medianFilter (const IplImage* img){
  IplImage* output = cvCloneImage(img);
  int rows, cols, step;
  uchar *data;

  rows = output->height;
  cols = output->width;
  step = output->widthStep;
  data = (uchar *)output->imageData;

  if(!data)
  { return; }

  //create a sliding window of size 9
  int window[9];

  for(int y = 1; y < rows - 1; y++){
      for(int x = 1; x < cols - 1; x++){

          // Pick up window element
          window[0] = data[(y - 1) * step + (x - 1)];
          window[1] = data[y * step + (x - 1)];
          window[2] = data[(y + 1) * step + (x - 1)];
          window[3] = data[(y - 1) * step + x];
          window[4] = data[y * step + x];
          window[5] = data[(y + 1) * step + x];
          window[6] = data[(y - 1) * step + (x + 1)];
          window[7] = data[y * step + (x + 1)];
          window[8] = data[(y + 1) * step + (x + 1)];

          // Sort the window to find median
          insertionSort(window);

          // Assign the median to centered element of the matrix
          data[y * step + x] = window[4];
      }
  }

  cvNamedWindow("Post-filter", CV_WINDOW_AUTOSIZE);
  cvShowImage("Post-filter", output);
  cvReleaseImage(&output);
  }

这是我的并行化代码

  void omp_medianFilter (const IplImage* img){
  IplImage* output = cvCloneImage(img);
  int rows, cols, step, nthreads;
  uchar *data;

  rows = output->height;
  cols = output->width;
  step = output->widthStep;
  data = (uchar *)output->imageData;

  if(!data)
  { return; }

  // Create a sliding window of size 9
  int window[9], x, y;

  // Set the number of threads to use
  omp_set_num_threads(NUM_THREADS);

  // Parallel code segment. Window, x and y are private variables for each thread
  #pragma omp parallel private(window, x, y)
  {
    //if(omp_get_thread_num() == 0){
      //nthreads = omp_get_num_threads();
      //printf("Numer of threads running: %d \n", nthreads);
    //}

    // Parallel for loop with dynamic scheduling and collapsing nested loops
    #pragma omp for schedule(dynamic, CHUNK) collapse(2)
      for(y = 1; y < rows - 1; y++){
          for(x = 1; x < cols - 1; x++){

              // Pick up 3x3 window elements
              window[0] = data[(y - 1) * step + (x - 1)];
              window[1] = data[y * step + (x - 1)];
              window[2] = data[(y + 1) * step + (x - 1)];
              window[3] = data[(y - 1) * step + x];
              window[4] = data[y * step + x];
              window[5] = data[(y + 1) * step + x];
              window[6] = data[(y - 1) * step + (x + 1)];
              window[7] = data[y * step + (x + 1)];
              window[8] = data[(y + 1) * step + (x + 1)];

              // Sort the window to find median
              insertionSort(window);

              // Assign the median to centered element of the matrix
              data[y * step + x] = window[4];
          }
      }
    }

  cvNamedWindow("Post-filter (OMP)", CV_WINDOW_AUTOSIZE);
  cvShowImage("Post-filter (OMP)", output);
  cvReleaseImage(&output);
  }

完整代码

#include <stdio.h>
#include <opencv2/imgproc/imgproc_c.h>
#include <opencv2/highgui/highgui_c.h>
#include <opencv2/core/types_c.h>
#include <sys/time.h>
#include <omp.h>

#define NUM_THREADS 8
#define CHUNK 15000

/*
 * Returns the current time of day reported by gettimeofday(), expressed in
 * seconds as a double (whole seconds plus the microsecond part scaled by 1e-6).
 * Differences between two calls give an elapsed wall-clock interval.
 */
double get_walltime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double whole_seconds = (double) now.tv_sec;
  const double fraction = now.tv_usec * 1e-6;
  return whole_seconds + fraction;
}

/*
 * Sorts the 9-element window in place into ascending order using insertion
 * sort. After the call, window[4] holds the median of the nine values.
 */
void insertionSort(int window[])
{
    int pos = 1;

    while (pos < 9) {
        const int key = window[pos];
        int slot = pos;

        // Shift every larger element one place right to open a slot for key
        while (slot > 0 && key < window[slot - 1]) {
            window[slot] = window[slot - 1];
            slot--;
        }
        window[slot] = key;
        pos++;
    }
}

void medianFilter (const IplImage* img){
      IplImage* output = cvCloneImage(img);
      int rows, cols, step;
      uchar *data;

      rows = output->height;
      cols = output->width;
      step = output->widthStep;
      data = (uchar *)output->imageData;

      if(!data)
      { return; }

      //create a sliding window of size 9
      int window[9];

      for(int y = 1; y < rows - 1; y++){
          for(int x = 1; x < cols - 1; x++){

              // Pick up window element
              window[0] = data[(y - 1) * step + (x - 1)];
              window[1] = data[y * step + (x - 1)];
              window[2] = data[(y + 1) * step + (x - 1)];
              window[3] = data[(y - 1) * step + x];
              window[4] = data[y * step + x];
              window[5] = data[(y + 1) * step + x];
              window[6] = data[(y - 1) * step + (x + 1)];
              window[7] = data[y * step + (x + 1)];
              window[8] = data[(y + 1) * step + (x + 1)];

              // Sort the window to find median
              insertionSort(window);

              // Assign the median to centered element of the matrix
              data[y * step + x] = window[4];
          }
      }

      cvNamedWindow("Post-filter", CV_WINDOW_AUTOSIZE);
      cvShowImage("Post-filter", output);
      cvReleaseImage(&output);
    }

// Parallelized implementation of median filter
void omp_medianFilter (const IplImage* img){
      IplImage* output = cvCloneImage(img);
      int rows, cols, step, nthreads;
      uchar *data;

      rows = output->height;
      cols = output->width;
      step = output->widthStep;
      data = (uchar *)output->imageData;

      if(!data)
      { return; }

      // Create a sliding window of size 9
      int window[9], x, y, j, k, min;

      // Set the number of threads to use
      omp_set_num_threads(NUM_THREADS);

      // Parallel code segment. Window, x and y are private variables for each thread
      #pragma omp parallel private(window, x, y, j, k, min)
      {
        //if(omp_get_thread_num() == 0){
          //nthreads = omp_get_num_threads();
          //printf("Numer of threads running: %d \n", nthreads);
        //}

        // Parallel for loop with dynamic scheduling and collapsing nested loops
        #pragma omp for schedule(dynamic, CHUNK) collapse(2)
          for(y = 1; y < rows - 1; y++){
              for(x = 1; x < cols - 1; x++){

                  // Pick up 3x3 window elements
                  window[0] = data[(y - 1) * step + (x - 1)];
                  window[1] = data[y * step + (x - 1)];
                  window[2] = data[(y + 1) * step + (x - 1)];
                  window[3] = data[(y - 1) * step + x];
                  window[4] = data[y * step + x];
                  window[5] = data[(y + 1) * step + x];
                  window[6] = data[(y - 1) * step + (x + 1)];
                  window[7] = data[y * step + (x + 1)];
                  window[8] = data[(y + 1) * step + (x + 1)];

                  // Sort the window to find median
                  //insertionSort(window);
                  for (int j = 0; j < 5; ++j)
                  {
                     //   Find position of minimum element
                     int min = j;
                     for (int l = j + 1; l < 9; ++l)
                       if (window[l] < window[min])
                          min = l;
                     //   Put found minimum element in its place
                     const int temp = window[j];
                     window[j] = window[min];
                     window[min] = temp;
                  }


                  // Assign the median to centered element of the matrix
                  data[y * step + x] = window[4];
              }
          }
        }

      cvNamedWindow("Post-filter (OMP)", CV_WINDOW_AUTOSIZE);
      cvShowImage("Post-filter (OMP)", output);
      cvReleaseImage(&output);
      }

  /*
   * Entry point: loads the image named by argv[1] as grayscale, shows it in
   * the "Original" window, runs the OpenMP median filter and prints the
   * elapsed wall-clock time, then waits for a key press.
   *
   * The timed interval spans the whole omp_medianFilter call, which also
   * clones the image and opens a display window, not only the filter loops.
   *
   * Returns 0 on success, 1 on a usage error or when the image cannot be
   * loaded.
   */
  int main(int argc, char *argv[])
  {
  IplImage* src;
  double time1, time2;

  if(argc<2){
    fprintf(stderr, "Usage: main <image-file-name>\n\7");
    // Non-zero status so scripts can detect the misuse
    return 1;
  }

  // Load a source image
  src = cvLoadImage(argv[1], CV_LOAD_IMAGE_GRAYSCALE);
  if(!src){
    // A missing or unreadable file must not reach cvShowImage or the filter
    fprintf(stderr, "Could not load image: %s\n", argv[1]);
    return 1;
  }
  cvNamedWindow("Original", CV_WINDOW_AUTOSIZE);
  cvShowImage("Original", src);

  /*time1 = get_walltime();
  medianFilter(src);
  time2 = get_walltime();
  printf("Sequential Code Performance: %fs\n", time2 - time1);*/

  time1 = get_walltime();
  omp_medianFilter(src);
  time2 = get_walltime();
  printf("Parallel Code Performance: %fs\n", time2 - time1);

  cvWaitKey(0);
  cvReleaseImage(&src);

  return 0;
  }

1 个答案:

答案 0 :(得分:0)

**已解决**

我采纳了许多给出的建议,也确实看到了性能提升,但这些建议所针对的地方并不是问题的根本原因。

原来问题出在一件非常愚蠢的事情上。我是在一台运行Ubuntu 16.04的虚拟机(VM)上运行程序的,却忘记增加分配给虚拟机的CPU核心数量,因此它只使用了1个核心,这大概意味着程序根本没有真正并行执行。