我的openMP代码出了什么问题?它总是只需要1个线程,并且与非并行版本
同时工作template <typename T>
Matrix<T>* Matrix<T>::OMPMultiplication(Matrix<T>* A, Matrix<T>* B){
if(A->ySize != B->xSize)
throw;
Matrix<T>* C = new Matrix<T>(A->xSize, B->ySize);
sizeType i, j, k;
T element;
#pragma omp parallel for private(i, j)
{
#pragma omp for private(i, j)
for( i = 0; i < A->xSize; i++ )
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++){
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
return C;
}
答案 0 :(得分:1)
首先,您遗漏了{}
循环的某些i
,并且变量k
需要对i
循环的每次迭代设置为私有。但是,我认为您还混淆了parallel
和for
pragma的组合方式。要成功并行化for循环,需要将其放在parallel
pragma中,然后放在for
pragma中。为此,您可以将代码更改为
#pragma omp parallel private(i, j, k)
{
#pragma omp for
for( i = 0; i < A->xSize; i++ ) {
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++) {
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
}
或使用合并的parallel for
表示法
#pragma omp parallel for private(i, j, k)
for( i = 0; i < A->xSize; i++ ) {
...
}
另外,请确保告诉OpenMP在此处使用多个线程。这可以通过omp_set_num_threads(<number of threads here>)
和设置OMP_NUM_THREADS
等环境变量来完成。
希望你能并行化。 :)
答案 1 :(得分:1)
使用此代码,我的4核心得到的结果略快一些:
omp_set_num_threads(4);
#pragma omp parallel for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
完整计划
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
int main() {
int i, j, n, a[719][719], b[719], c[719];
clock_t start = clock();
n = 100; //Max 719
printf("Matrix A\n");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf("\n");
}
printf("\nMatrix B\n");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d\n", b[i]);
}
}
printf("\nA * B\n");
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
c[i] = 0;
}
}
#pragma omp parallel private(i,j) shared(n,a,b,c)
{
#pragma omp for schedule(dynamic)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
c[i] += b[j] * a[j][i];
}
}
}
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
printf("%d\n", c[i]);
}
}
clock_t stop = clock();
double elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
printf("\nTime elapsed: %.5f\n", elapsed);
start = clock();
printf("Matrix A\n");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf("\n");
}
printf("\nMatrix B\n");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d\n", b[i]);
}
}
printf("\nA * B\n");
omp_set_num_threads(4);
#pragma omp parallel for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
stop = clock();
elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
printf("\nTime elapsed: %.5f\n", elapsed);
return 0;
}
第一种方法需要
已过去的时间:0.03442
第二种方法
已过去的时间:0.02630