我正在尝试使用openMP在c ++中并行化一段代码,但我发现结果比串行版本慢。 我尝试访问我创建的矩阵中的元素,但我不明白为什么这会影响我在下面发布的代码中的速度。
我开始学习openMP,所以我可能没有看到一些明显的错误。
这是代码:
long firstProcessorItem=numberColdStartItem/2;
long secondProcessorItem=numberColdStartItem-firstProcessorItem;
long processorId;
cout<<"ciao"<<endl;
#pragma omp parallel private(processorId) num_threads(2)
{
processorId=omp_get_thread_num();
cout<<processorId<<endl;
auto cpu=sched_getcpu();
cout<<"dsgahrtbw"<<cpu;
if (processorId==0) {
cout<<"aaa"<<endl;
for (int j=1; j<=firstProcessorItem; j++) {
vector<CoupleItem<scalar>> vettoreCopie;
vettoreCopie.resize(numberPastItem);
auto& writeColumn=predictions.getColumnNonConst(j);
vector<scalar>& readColumn=similarityPastFuture.getColumnNonConst(j);
for (int i=numberUsers; i>=1; i--) {
auto& readRow=userRatingPast.getRowNonConst(i);
long int sizeReadRow=readRow.size();
computeCoupleVector(readRow, readColumn, vettoreCopie);
if (sizeReadRow>N) {
std::sort(vettoreCopie.begin(),vettoreCopie.begin()+sizeReadRow,compare);
writeColumn[i-1]=fastScalarProduct(vettoreCopie, N);
} else {
writeColumn[i-1]=fastScalarProduct(vettoreCopie, sizeReadRow);
}
}
}
}
if (processorId==1) {
cout<<"bbb"<<endl;
for (int j=firstProcessorItem+1; j<=numberColdStartItem; j++) {
vector<CoupleItem<scalar>> vettoreCopie;
vettoreCopie.resize(numberPastItem);
auto& writeColumn=predictions.getColumnNonConst(j);
vector<scalar>& readColumn=similarityPastFuture.getColumnNonConst(j);
for (int i=1; i<=numberUsers; i++) {
auto& readRow=userRatingPast.getRowNonConst(i);
long int sizeReadRow=readRow.size();
computeCoupleVector(readRow, readColumn, vettoreCopie);
if (sizeReadRow>N) {
std::sort(vettoreCopie.begin(),vettoreCopie.begin()+sizeReadRow,compare);
writeColumn[i-1]=fastScalarProduct(vettoreCopie, N);
} else {
writeColumn[i-1]=fastScalarProduct(vettoreCopie, sizeReadRow);
}
}
}
}
}
有人知道这种行为吗? 非常感谢你!