我试图在Karatsuba的多项式乘法中并行化递归。但它比没有线程更慢。什么问题 我有这段代码:
int karatsubaMain(int size)
{
Polynom pol1(size),pol2(size);
omp_set_num_threads(8);
double start = omp_get_wtime();
int* result = mult(pol1.polynom,pol2.polynom,0,pol1.size);
double end = omp_get_wtime();
printf("%f ", end - start);
return 0;
}
int * mult(int*a, int *b,int start, int N){
int * c= new int[2*N-1];
if(N==1){
c[0]=a[start]*b[start];
return c;
}
int * t1= new int[N/2];
int * t2= new int[N/2];
int * cM,*cL,*cH;
for(int i=0;i<N/2;i++){
t1[i]=a[start +i]+a[start + i + N/2];
t2[i]=b[start +i]+b[start + i + N/2 ];
}
#pragma omp parallel shared(cM,cL,cH)
{
#pragma omp single nowait
{
#pragma omp task if(N > 4096)
cM=mult(t1,t2,0,N/2);
#pragma omp task if(N > 4096)
cL=mult(a,b,0,N/2);
#pragma omp task if(N > 4096)
cH=mult(a,b,N/2,N/2);
}
#pragma omp taskwait
}
c[N-1]=0;
for(int i=0;i<N-1;i++){
c[i]=cL[i];
c[N+i]=cH[i];
c[N/2+i]+=(cM[i]-(cL[i]+cH[i]));
}
delete []t1;
delete []t2;
delete []cM;
delete []cL;
delete []cH;
return c;
}
答案 0 :(得分:1)
首先,我告诉你你做了什么,你更了解变化:
在每个步骤中执行此操作:
#pragma omp parallel shared(cM,cL,cH) //open a new parallel region (ie create threads)
{
#pragma omp single nowait //only one thread should do the following
{
#pragma omp task if(N > 4096) //create task
cM=mult(t1,t2,0,N/2);
#pragma omp task if(N > 4096) //create task
cL=mult(a,b,0,N/2);
#pragma omp task if(N > 4096) //create task
cH=mult(a,b,N/2,N/2);
} //after this line all threads are working on the same
#pragma omp taskwait //before executing further the tasks should be finished
} // close all threads created at this parallel
你打算做什么:
创建一些线程,一旦创建了递归的根,每次递归调用都是一个任务,所有应该处理任务,当所有子任务都完成后计算结果,接受新任务
在karatsubaMain()
中你应该创建线程,然后一个线程插入根任务:
double start = omp_get_wtime();
int* result;
#pragma omp parallel shared(result, a, b, size)
{
#pragma omp single //also #pragma omp master usable here
result = mult(a, b, 0, size);
}
double end = omp_get_wtime();
在mult()
中,您只需创建任务,因为该区域已由不同的线程并行处理:
for(int i = 0; i < N / 2; i++)
{
t1[i] = a[start + i] + a[start + i + N / 2];
t2[i] = b[start + i] + b[start + i + N / 2 ];
}
#pragma omp task shared(cM) if(N > 4096)
cM = mult(t1, t2, 0, N / 2);
#pragma omp task shared(cL) if(N > 4096)
cL = mult(a, b, 0, N / 2);
#pragma omp task shared(cH) if(N > 4096)
cH = mult(a, b, N / 2, N / 2);
#pragma omp taskwait
c[N - 1] = 0;
通过这种方式,我能够将代码的简化版本(多项式的int-array insead)相对于顺序代码加速约15%。
一般评论:大多数时候不建议使用嵌套的平行区域