我正在尝试使用OpenMP API(或pthreads)来并行化以下代码。它的时间复杂度是O(n)。
我想知道是否可以将输入数组划分为 X 块(X = 线程数),并对每一块并行执行该过程。
这是一个非常经典的算法问题,到目前为止我还没有看到有人试图实现并行化版本。
重要提示:由于数组只能从左到右读取,简单的归约(reduction)无法解决此问题,所以并行化并不是那么显而易见……
#include<stdio.h>
/* Largest value of arr[j] - arr[i] over all index pairs with i < j.
   Precondition: arr holds at least two elements.
   The result is negative when the array is strictly decreasing
   and 0 when all elements are equal. */
int maxDiff(int arr[], int arr_size)
{
    int lowest = arr[0];            /* smallest element seen so far */
    int best = arr[1] - arr[0];     /* best difference seen so far */

    for (int k = 1; k < arr_size; ++k) {
        const int gain = arr[k] - lowest;

        if (gain > best)
            best = gain;
        /* Update the minimum only after using it, so an element is never
           paired with itself. */
        if (arr[k] < lowest)
            lowest = arr[k];
    }
    return best;
}
答案 0 :(得分:2)
由于存在数据依赖且计算量很小,这在多核上不太可能获得很大的加速——不过,你可以在数组的每个块内计算局部最小值、局部最大值以及块内的局部最优差值,然后再跨块进行比较。由于最后一步,该算法的运行时间为 $O(N) + O(P^2)$(其中 $P$ 为线程数),这进一步限制了可扩展性。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include <limits.h>
#include <omp.h>
/* Stores the current time (via gettimeofday) in *t; defined later in this file. */
void tick(struct timeval *t);
/* Returns the number of seconds elapsed since the time stored in *t. */
double tock(const struct timeval * const t);
/*
 * Serial reference: largest arr[j] - arr[i] over all index pairs i < j.
 *
 * arr must hold at least two elements.  The difference is tracked as a
 * signed int and converted to unsigned int on return, so a negative result
 * (strictly decreasing input) wraps around to a large unsigned value.
 */
unsigned int maxDiff(const int * const arr, const int arr_size)
{
    int lowest = arr[0];
    int widest = arr[1] - arr[0];
    int idx = 1;

    while (idx < arr_size) {
        const int cur = arr[idx++];
        const int gap = cur - lowest;

        if (gap > widest)
            widest = gap;
        /* The minimum is updated after use so cur is never paired with itself. */
        if (cur < lowest)
            lowest = cur;
    }
    return (unsigned int)widest;
}
/*
 * Parallel (OpenMP) maximum of arr[j] - arr[i] over all index pairs i <= j.
 *
 * Each thread scans one contiguous chunk of the array (static schedule) and
 * records the chunk's minimum, maximum and best in-chunk difference.  The
 * chunks are then compared pairwise: the minimum of an earlier chunk against
 * the maximum of a later one.
 *
 * The result is never negative: an element paired with itself contributes 0,
 * so a decreasing array yields 0.  Differences are computed in unsigned
 * arithmetic, so even INT_MAX - INT_MIN is returned exactly.
 *
 * When compiled without OpenMP the pragmas are ignored and the function runs
 * as a single chunk on one thread.
 */
unsigned int ompMaxDiff(const int * const arr, const int arr_size)
{
#ifdef _OPENMP
    int nthreads = omp_get_max_threads();
#else
    int nthreads = 1;
#endif
    int maxes[nthreads];
    int mins [nthreads];
    unsigned int best = 0;

    /* Sentinels mark chunks that never receive an element (more threads than
       elements, or fewer threads granted than requested). */
    for (int i = 0; i < nthreads; i++) {
        mins [i] = INT_MAX;
        maxes[i] = INT_MIN;
    }

    /* arr and arr_size are const-qualified: newer compilers require them to
       be listed under default(none), and firstprivate is accepted by both
       older and newer OpenMP implementations. */
    #pragma omp parallel num_threads(nthreads) default(none) firstprivate(arr, arr_size) shared(mins, maxes) reduction(max:best)
    {
#ifdef _OPENMP
        const int idx = omp_get_thread_num();
#else
        const int idx = 0;
#endif
        int min = INT_MAX, max = INT_MIN;

        #pragma omp for schedule(static)
        for (int i = 0; i < arr_size; i++) {
            if (arr[i] < min) min = arr[i];
            if (arr[i] > max) max = arr[i];
            /* arr[i] >= min here, so the unsigned subtraction is exact and
               cannot overflow the way a signed one could. */
            const unsigned int diff = (unsigned int)arr[i] - (unsigned int)min;
            if (diff > best) best = diff;
        }
        mins [idx] = min;
        maxes[idx] = max;
    }

    /* Cross-chunk pairs.  Only strictly positive differences can improve on
       best (>= 0); the guard also skips empty chunks, whose sentinels never
       satisfy maxes[j] > mins[i], and avoids signed overflow. */
    for (int i = 0; i < nthreads - 1; i++) {
        for (int j = i + 1; j < nthreads; j++) {
            if (maxes[j] > mins[i]) {
                const unsigned int diff =
                    (unsigned int)maxes[j] - (unsigned int)mins[i];
                if (diff > best) best = diff;
            }
        }
    }
    return best;
}
/*
 * Benchmark driver: fills an array with pseudo-random values, plants a
 * decoy pair, then times the serial and OpenMP implementations and prints
 * both answers.  Returns EXIT_FAILURE if the array cannot be allocated.
 */
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const int nitems = 1000000;
    int *data = malloc(nitems * sizeof *data);
    if (data == NULL) {
        fprintf(stderr, "failed to allocate %d ints\n", nitems);
        return EXIT_FAILURE;
    }

    srand((unsigned int)time(NULL));
    for (int i = 0; i < nitems; i++)
        data[i] = rand() % 500;       /* values in [0, 499] */

    /* Decoy: 700 comes immediately BEFORE -700, so the pair must not be
       reported as a difference of 1400.  The largest valid answer is
       499 - (-700) = 1199. */
    data[(nitems/2)+1] = -700;
    data[(nitems/2)]   = 700;

    struct timeval start;

    tick(&start);
    unsigned int res = maxDiff(data, nitems);
    double restime = tock(&start);
    printf("Serial: answer = %u, time = %lf\n", res, restime);

    tick(&start);
    res = ompMaxDiff(data, nitems);
    restime = tock(&start);
    printf("OpenMP: answer = %u, time = %lf\n", res, restime);

    free(data);
    return 0;
}
/* Record the current time of day in *t, to be used as the start mark for tock(). */
void tick(struct timeval *t)
{
    (void)gettimeofday(t, NULL);
}
/*
 * Seconds elapsed between the time stored in *t and now.
 * The whole-second and microsecond parts are subtracted separately and the
 * microsecond part is scaled to seconds before the two are added.
 */
double tock(const struct timeval * const t)
{
    struct timeval end;
    gettimeofday(&end, NULL);

    const double whole_secs = (double)(end.tv_sec - t->tv_sec);
    const double frac_secs  = (double)(end.tv_usec - t->tv_usec) / 1000000.;
    return whole_secs + frac_secs;
}
在 8 个核心上运行的结果如下:
$ gcc -fopenmp -O3 -Wall -std=c11 maxdiff.c -o maxdiff
$ ./maxdiff
Serial: answer = 1199, time = 0.001760
OpenMP: answer = 1199, time = 0.000488
答案 1 :(得分:1)
我对 OpenMP 不是特别熟悉,但这里有一个满足结合律的运算符,可以用来并行地解决这个问题。
/* Summary of a contiguous run of array elements; built by singleton() and
   merged pairwise by combine(). */
struct intermediate {
int min_elem; /* smallest element in the run */
int max_elem; /* largest element in the run */
int max_diff; /* best later-minus-earlier difference inside the run;
                 INT_MIN for a single element (see singleton) */
};
使用此函数为每个元素构造单元素(singleton)中间结果,得到一个列表。
struct intermediate singleton(int x) {
return (struct intermediate){x, x, INT_MIN};
}
使用此函数合并两个相邻的中间结果。
struct intermediate combine(struct intermediate a, struct intermediate b) {
return (struct intermediate){min(a.min_elem, b.min_elem),
max(a.max_elem, b.max_elem),
max(max(a.max_diff, b.max_diff),
b.max_elem - a.min_elem)};
}
一种可能的求值策略可以画成下面这样。
C
/ \
C \
/ \ \
/ \ \
/ \ \
C C \
/ \ / \ \
S S S S S
| | | | |
0 1 2 3 4
此处 C 表示合并(combine),S 表示单元素(singleton)。由于 combine 满足结合律,任何二叉树形状的求值顺序都可以。下面是另一种策略。
C
/ \
/ \
/ \
/ C
/ / \
C / C
/ \ / / \
S S S S S
| | | | |
0 1 2 3 4