Question

我在使用OpenMP编写一些并行c代码时遇到了并发问题。

Heres a snippet

#include <stdio.h>
#include <time.h>
#include <math.h>

#define FALSE 0
#define TRUE 1

int count_primes_0(int);
int count_primes_1(int);
int count_primes_2(int);

int main(int argc, char *argv[]){
    int n;

    if (argc != 2){
        printf("Incorrect Invocation, use: \nq1 N");
        return 0;
    } else {
        n = atoi(argv[1]);  
    }

    if (n < 0){
        printf("N cannot be negative");
        return 0;
    }

    printf("N = %d\n", n);

    //omp_set_num_threads(1);
    time_it(count_primes_0, n, "Method 0");
    time_it(count_primes_1, n, "Method 1");
    time_it(count_primes_2, n, "Method 2");

    return 0;
}

int is_prime(int n){
    for(int i = 2; i <= (int)(sqrt((double) n)); i++){
        if ((n % i) == 0){
            return FALSE;
        }
    }

    return n > 1;
}

void time_it( int (*f)(int), int n, char *string){
    clock_t start_clock;
    clock_t end_clock;
    double calc_time;
    int nprimes;

    struct timeval start_val;
    struct timeval end_val;

    start_clock = clock();
    nprimes = (*f)(n);
    end_clock = clock();
    calc_time = ((double)end_clock - (double)start_clock) / CLOCKS_PER_SEC;
    printf("\tNumber of primes: %d \t Time taken: %fs\n\n", nprimes, calc_time);
}

// METHOD 0
// Base Case no parallelization
int count_primes_0(int n){
    int nprimes = 0;

    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 1
// Use only For and Critical Constructs
int count_primes_1(int n){
    int nprimes = 0;

    #pragma omp parallel for
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
            #pragma omp critical
            nprimes++;
        }
    }

    return nprimes;
}

//METHOD 2
// Use Reduction
int count_primes_2(int n){
    int nprimes = 0;

    #pragma omp parallel for reduction(+:nprimes)
    for(int i = 1; i <= n; i++){
        if (is_prime(i)) {
           nprimes++;
        }
    }

    return nprimes;
}

我面临的问题是，当我使用omp_set_num_threads（）时，我使用的线程越少我的函数运行得越快 - 或者越接近基本非平行大小写的运行时间

时间结果：这些是在8核机器上运行的

8个主题：方法0：0.07s;方法1：1.63s;方法2：1.4s

4个主题：方法0：0.07s;方法1：0.16s;方法2：0.16s

2个帖子：方法0：0.07s;方法1：0.10;方法2：0.09

1个主题：方法0：0.07s;方法1：0.08s;方法2：0.07s

我尝试过禁用优化并使用不同的gcc版本

感谢任何帮助。

编辑：在Linux中使用clock会返回'错误'时间，挂钟时间就是我所需要的，因此使用ether omp_get_wtime（）或Linux函数timeit会产生正确的结果。

Answer 1

我很惊讶您已经看到该计划取得了成功，因为它已经在上面了。如果您查看RedHat Linux手册页中的clock（），您会看到它“返回程序使用的处理器时间的近似值”。放入OpenMP指令会导致更多开销，因此在运行OpenMP时应该会看到更多的处理器总时间。您需要注意的是经过时间（或挂钟时间）。当你并行运行（并且你有一个可以从并行中受益的程序）时，你会看到经过的时间。 OpenMP规范定义了一个例程（omp_get_wtime（））来提供这些信息。

使用clock（）和omp_get_wtime（）更改您的程序报告：

$ a.out 1000000（1,000,000）

2个处理器：

clock（）：0.23 wtime（）：0.23 clock（）：0.96 wtime（）：0.16 clock（）：0.59 wtime（）：0.09

4个处理器：

clock（）：0.24 wtime（）：0.24 clock（）：0.97 wtime（）：0.16 clock（）：0.57 wtime（）：0.09

8个处理器：

clock（）：0.24 wtime（）：0.24 clock（）：2.60 wtime（）：0.26 clock（）：0.64 wtime（）：0.09

$ a.out 10000000（10,000,000）

2个处理器：

clock（）：6.07 wtime（）：6.07 clock（）：10.4 wtime（）：1.78 clock（）：11.3 wtime（）：1.65

4个处理器：

clock（）：6.07 wtime（）：6.07 clock（）：11.5 wtime（）：1.71 clock（）：10.7 wtime（）：1.72

8个处理器：

clock（）：6.07 wtime（）：6.07 clock（）：9.92 wtime（）：1.83 clock（）：11.9 wtime（）：1.86

Answer 2

OpenMP不会将循环与其中的函数调用并行化，除非参数是私有的。解决方案是在循环中内联is_prime()。

Openmp基本并行化

2 个答案: