Question

我遇到了问题和疑问。我试着用omp做一些矩阵乘法。

如果我使用多个线程创建矩阵a，b和c，则列大小不等。即使我对push_back使用critical也存在问题。我认为omp将for循环划分为相同大小的部分，因此每个线程都应该有自己的列。这个问题在吗？

给每个线程一个向量的好方法是什么？什么是避免没有关键和原子的共享内存问题的好方法，例如如果我正在生成数据并希望将其保存在某个地方。谢谢。附：我正在研究我的英语。它远非完美，所以请不要介意。

#include "stdafx.h"
#include <omp.h>
#include <iostream>
#include <ctime>
#include <vector>

#define NRA 300                /* number of rows in matrix A */
#define NCA 300              /* number of columns in matrix A */
#define NCB 300                  /* number of columns in matrix B */

int main(int argc, char *argv[])
{
    int i, j, k, chunk;

    std::vector < std::vector<int> > a;
    a.resize(NRA);
    std::vector < std::vector<int> > b;
    b.resize(NCA);
    std::vector < std::vector<int> > c;
    c.resize(NRA);
    /*
    double a[NRA][NCA];
    double b[NCA][NCB];
    double c[NRA][NCB];
    */
    chunk = 10;                    

    std::clock_t start;         //Zeitmessung
    double duration;            //Zeitdauer der Parallelisierung


        omp_set_num_threads(4);
#pragma omp parallel
        {
#pragma omp for schedule (static, chunk) 
            for (i = 0; i < NRA; i++)
                for (j = 0; j < NCA; j++)
                    a[i].push_back(i + j);
#pragma omp for schedule (static, chunk)
            for (i = 0; i < NCA; i++)
                for (j = 0; j < NCB; j++)
                    b[i].push_back(i*j);
#pragma omp for ordered schedule(static, chunk)
            for (i = 0; i < NRA; i++)
                for (j = 0; j < NCB; j++)
                    c[i].push_back(0);
        }

    for (int nthreads = 1; nthreads < 40; nthreads++)
    {
        start = std::clock();
        omp_set_dynamic(0);
#pragma omp parallel shared(a,b,c,nthreads,chunk) private(i,j,k) num_threads(nthreads)
    {

#pragma omp for schedule (static, chunk)
        for ( i = 0; i < NRA; i++)
            for (j = 0; j < NCB; j++)
                c[i][j] = 0;

#pragma omp for ordered schedule (static, chunk) 
        for (i = 0; i < NRA; i++)
        {
            for ( j = 0; j < NCB; j++)
                for (k = 0; k < NCA; k++)
                    c[i][j] += a[i][k] * b[k][j];
        }
    }   

        duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
        //Time n threads need
        std::cout << "Benoetigte Zeit fuer " << nthreads << " Threads betrug " << duration << " Sekunden." << std::endl;
    }

    std::cin.get();

}

Answer 1

push_back()肯定会修改矢量的元数据，尤其是大小。尝试resize()内部向量，就像使用外部向量（a，b，c）一样，然后修改元素（a[i] = i + j;等。并行运行。

由于你知道开头的元素的最终计数，你可以使用普通数组而不是向量来最小化开销。

int a[NRA][NCA];
int b[NCA][NCB];
int c[NRA][NCB];

我想知道为什么你已经注释掉代码的类似部分。 ; - ）

用omp生成矩阵会导致麻烦，不同的columsizes

1 个答案: