我正在使用Pthreads学习并行处理。我有一个四核处理器。不幸的是,以下代码的并行化部分运行速度比非并行化代码慢大约5倍。我在这做错了什么?在此先感谢您的帮助。
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#define NTHREADS 4
#define SIZE NTHREADS*10000000
/* Per-thread work descriptor: arr points at the first element of this
   thread's slice of the shared array; sum receives the thread's partial
   result (read by main after pthread_join). */
struct params {
    int * arr;  /* start of this thread's chunk of the input array */
    int sum;    /* partial sum produced by the worker thread */
};
/* The worker function for the pthreads */
/* Worker function for the pthreads.
 * x points at a struct params; the thread sums SIZE/NTHREADS elements
 * starting at params.arr and stores the partial result in params.sum.
 * Always returns NULL. */
void * myFun (void * x){
    struct params * b = (struct params *) x;
    /* BUG FIX: the original did `b->sum += ...` on every iteration.
       The inputs[] structs are adjacent in memory, so the threads'
       read-modify-write of ->sum causes false sharing on the same cache
       lines and prevents the compiler from keeping the accumulator in a
       register — this is why the "parallel" version ran ~5x slower.
       Accumulate locally and publish the result once at the end. */
    int sum = 0;
    for (int i = 0; i < (int)(SIZE/NTHREADS); ++i){
        sum += b->arr[i];
    }
    b->sum = sum;   /* single store after the loop */
    return NULL;
}
/* unparallelized summing function*/
/* Sequential reference implementation: returns the sum of the first
   `size` elements of `arr`. */
int arrSum(int * arr, int size){
    int total = 0;
    int idx = 0;
    while (idx != size) {
        total += arr[idx];
        ++idx;
    }
    return total;
}
int main(int argc, char * argv[]){
clock_t begin, end;
double runTime;
int rc, i;
int sum1, sum2 = 0;
pthread_t threads[NTHREADS];
/* create array to sum over */
int * myArr = NULL;
myArr = (int *) calloc(SIZE, sizeof(int));
if (myArr == NULL){
printf("problem allocating memory\n");
return 1;
}
for (int i = 0; i < SIZE; ++i){
myArr[i] = 1;
}
/* create array of params structs to feed to threads */
struct params p;
p.sum = 0;
struct params inputs[NTHREADS];
for(i = 0; i != NTHREADS; ++i){
p.arr = myArr + i*(int)(SIZE/NTHREADS);
inputs[i] = p;
}
/* spawn the threads */
begin = clock();
for(i = 0; i != NTHREADS; ++i){
rc = pthread_create(&threads[i], NULL, myFun, (void *) &inputs[i]);
}
/* wait for threads to finish */
for(i = 0; i != NTHREADS; ++i){
rc = pthread_join(threads[i], NULL);
}
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Parallelized code run time: %f\n", runTime);
/* run the unparallelized code */
begin = clock();
sum2 = arrSum(myArr, SIZE);
end = clock();
runTime = (double)(end - begin)/CLOCKS_PER_SEC;
printf("Unparallelized code run time: %f\n", runTime);
/* consolidate and print results from threads */
for(i = 0; i != NTHREADS; ++i){
sum1 += inputs[i].sum;
}
printf("sum1, sum2: %d, %d \n", sum1, sum2);
free(myArr);
/* be disappointed when my parallelized code showed no speedup */
return 1;
}
答案 0 :(得分:2)
您错过了并行编程的一个重要方面。
每个进程需要创建一次工作线程,而不是每个任务。
创建和销毁线程需要时间。
解决方案是使用线程池并将任务发送到池中。
我的建议是使用 OpenMP ,这大大简化了这项任务,并与许多编译器配合使用。
示例:
为了更快地完成这项工作,请进行一些循环展开——例如在单个循环迭代内计算 8 个数字的部分和。
答案 1 :(得分:2)
主要问题是您正在使用 clock(),
它返回的不是挂钟时间(wall time),而是所有线程累计的 CPU 时间。这是 SO 上 OpenMP 标签下最常见的错误(如果 SO 有高频错误列表,这一条应该名列前茅)。
获得挂钟时间最简单的方法是使用 OpenMP 中的函数:omp_get_wtime()
。它在 Linux 和 Windows 上都适用于 GCC、ICC 和 MSVC(我假设现在支持 OpenMP 3.1 的 Clang 也适用)。
当我在你的代码中使用它时,我会使用我的四核/八线程i7 IVB系统:
Parallelized code run time: 0.048492
Unparallelized code run time: 0.115124
sum1, sum2: 400000000, 400000000
其他一些评论。您的日程安排容易出错。您将每个线程的数组设置为
p.arr = myArr + i*(int)(SIZE/NTHREADS);
然后让每个线程都在(SIZE/NTHREADS)
上运行。对于SIZE
和NTHREADS
的某些值的舍入错误,这可能会产生错误的结果。
你应该让每个线程都运行
int start = ithread*SIZE/NTHREADS;
int finish = (ithreads+1)*SIZE/NTHREADS;
然后让每个线程指向数组的开头并执行
int sum = 0;
for (i = start; i < finish; ++i){
sum += b->arr[i];
}
这实际上就是 OpenMP 的 schedule(static)
所做的。事实上,通过执行以下代码,
你可以用 OpenMP 达到与 pthreads
相同的效果:
int sum = 0;
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < size; ++i){
sum += arr[i];
}
这是我使用的代码
//gcc -O3 -std=gnu99 t.c -lpthread -fopenmp
#include <stdio.h>
#include <time.h>
#include <pthread.h>
#include <stdlib.h>
#include <omp.h>
#define NTHREADS 4
#define SIZE NTHREADS*100000000
/* Per-thread work descriptor: arr points at this thread's chunk of the
   shared array; sum receives the thread's partial result. */
struct params {
    int * arr;  /* start of this thread's slice of the input array */
    int sum;    /* partial sum written once by the worker thread */
};
/* The worker function for the pthreads */
/* pthread worker: sums SIZE/NTHREADS elements starting at params.arr
   into a local accumulator, then stores the result in params.sum.
   Always returns NULL. */
void * myFun (void * x){
    struct params * job = (struct params *) x;
    const int chunk = (int)(SIZE/NTHREADS);
    int total = 0;
    for (int k = 0; k < chunk; ++k){
        total += job->arr[k];
    }
    job->sum = total;
    return NULL;
}
/* unparallelized summing function*/
/* Sequential baseline: sum of the first `size` ints of `arr`. */
int arrSum(int * arr, int size){
    int acc = 0;
    for (int *p = arr; p != arr + size; ++p){
        acc += *p;
    }
    return acc;
}
/* Benchmark driver for the corrected code: times the pthread-parallel
   sum against the sequential sum using the OpenMP wall clock. */
int main(int argc, char * argv[]) {
    double runTime;
    int i;
    /* BUG FIX: `int sum1, sum2 = 0;` initialized only sum2; reading the
       uninitialized sum1 in the consolidation loop below was undefined
       behavior (it merely happened to print the right answer). */
    int sum1 = 0, sum2 = 0;
    pthread_t threads[NTHREADS];

    /* create array to sum over — SIZE is 400M ints (~1.6 GB), so the
       allocation can legitimately fail; check it */
    int * myArr = calloc(SIZE, sizeof *myArr);
    if (myArr == NULL){
        printf("problem allocating memory\n");
        return 1;
    }
    for (i = 0; i < SIZE; ++i){
        myArr[i] = 1;
    }

    /* one params struct per thread, each pointing at its own chunk */
    struct params inputs[NTHREADS];
    for(i = 0; i < NTHREADS; ++i){
        inputs[i].arr = myArr + i * (SIZE / NTHREADS);
        inputs[i].sum = 0;
    }

    /* spawn the threads; omp_get_wtime() gives wall time, unlike clock() */
    runTime = -omp_get_wtime();
    for(i = 0; i < NTHREADS; ++i){
        if (pthread_create(&threads[i], NULL, myFun, &inputs[i]) != 0){
            printf("problem creating thread %d\n", i);
            free(myArr);
            return 1;
        }
    }
    /* wait for threads to finish */
    for(i = 0; i < NTHREADS; ++i){
        pthread_join(threads[i], NULL);
    }
    runTime += omp_get_wtime();
    printf("Parallelized code run time: %f\n", runTime);

    /* run the unparallelized code */
    runTime = -omp_get_wtime();
    sum2 = arrSum(myArr, SIZE);
    runTime += omp_get_wtime();
    printf("Unparallelized code run time: %f\n", runTime);

    /* consolidate and print results from threads */
    for(i = 0; i < NTHREADS; ++i){
        sum1 += inputs[i].sum;
    }
    printf("sum1, sum2: %d, %d \n", sum1, sum2);
    free(myArr);
    /* BUG FIX: return 0 on success, not 1 */
    return 0;
}