我有一个矩阵,称其为small_matrix,它大约有100000行、128列,以单个一维数组的形式存储(我要将它用于CUDA计算,因此需要节省空间)。我还有一个更大的矩阵,称为large_matrix,其行数是small_matrix的10倍,行长与small_matrix相同,我想用small_matrix中的行来填充它的各行。但是,填充过程不是1:1的。有一个map数组将large_matrix中的每一行映射到small_matrix中的一行。small_matrix中的同一行可以被large_matrix中的多行映射到。我们可以假设map数组是随机生成的。另外,large_matrix中的某一行有很小的概率(假设为1%)被填入随机值,而不是small_matrix中的实际值。

我正在尝试在C++中使用OpenMP并行化来优化这个过程,但一直没有成功。到目前为止,我尝试过的所有做法都只会让运行时间随着线程数的增加而变长,而不是缩短。下面是相关代码,我想要优化的是expand_matrix:

#include <stdio.h>
#include <omp.h>
#include <random>
#include <stdlib.h>
#include <cstddef>
#include <ctime>
#include <cstring>
using namespace std;

/**
 * Allocates `size` bytes whose address is a multiple of `align`.
 *
 * Uses _aligned_malloc when built with MSVC and posix_memalign otherwise.
 * The two paths are mutually exclusive; without the #else/#endif split the
 * function would try to allocate twice and would not compile on either
 * platform.
 *
 * @param size  Number of bytes to allocate.
 * @param align Required alignment in bytes (posix_memalign requires a power
 *              of two that is a multiple of sizeof(void*)).
 * @return Pointer to the aligned storage, or a null pointer on failure.
 *         Release it with aligned_free().
 */
inline void* aligned_malloc(size_t size, size_t align){
    void *result = nullptr;
    #ifdef _MSC_VER
    result = _aligned_malloc(size, align);
    #else
    // posix_memalign returns non-zero on failure and leaves `result`
    // unspecified, so reset it explicitly.
    if(posix_memalign(&result, align, size)) result = nullptr;
    #endif
    return result;
}
/**
 * Releases storage obtained from aligned_malloc().
 *
 * Mirrors the platform split in aligned_malloc(): memory from
 * _aligned_malloc must go to _aligned_free, while memory from
 * posix_memalign is released with plain free().
 *
 * @param ptr Pointer previously returned by aligned_malloc(); may be null.
 */
inline void aligned_free(void *ptr) {
    #ifdef _MSC_VER
    _aligned_free(ptr);
    #else
    free(ptr);
    #endif
}


// Forward declaration so that main() can call expand_matrix before its definition.
void expand_matrix(int num_rows_in_large_matrix, int row_length, long long* map,  float*small_matrix, float* large_matrix, const int num_threads);

int main(){
    int row_length = 128;
    long long small_matrix_rows = 100000;
    long long large_matrix_rows = 1000000;
    long long *map = new long long [large_matrix_rows]; 
    float *small_matrix = (float*)aligned_malloc(small_matrix_rows*128*sizeof(float), 128);
    float *large_matrix = (float*)aligned_malloc(large_matrix_rows*128*sizeof(float), 128);

    minstd_rand gen(std::random_device{}()); //NOTE: Valgrind will give an error saying: vex amd64->IR: unhandled instruction bytes: 0xF 0xC7 0xF0 0x89 0x6 0xF 0x42 0xC1 :: look: https://bugs.launchpad.net/ubuntu/+source/valgrind/+bug/
    uniform_real_distribution<double> values_dist(0, 1);
    uniform_int_distribution<long long> map_dist(0,small_matrix_rows);
    for (long long i = 0; i<small_matrix_rows*row_length;i++){
        small_matrix[i] = values_dist(gen)-0.5;
    for (long long i=0; i<large_matrix_rows;i++){
        if (values_dist(gen)<0.99)
            map[i] = map_dist(gen);
    clock_t start, end;
    int num_threads =4;
    printf("Populated matrix and generated map\n");
    start = clock();
    expand_matrix(large_matrix_rows, row_length, map, small_matrix, large_matrix, num_threads);
    end = clock();
    printf("Time to expand using %d threads = %f\n", num_threads, double(end-start)/CLOCKS_PER_SEC);
    return 0;

/**
 * Fills every row of large_matrix from small_matrix according to map.
 *
 * For each row i of large_matrix:
 *   - if map[i] == -1, the row is filled with the constant 0.5;
 *   - otherwise row map[i] of small_matrix is copied into it.
 * Rows are independent, so the loop is distributed over OpenMP threads.
 *
 * @param num_rows_in_large_matrix Number of rows in large_matrix and entries in map.
 * @param row_length               Number of floats per row in both matrices.
 * @param map                      For each large row, the source row index in
 *                                 small_matrix, or -1 for "no source row".
 * @param small_matrix             Source matrix, stored row after row in one array.
 * @param large_matrix             Destination matrix, stored row after row in one array.
 * @param num_threads              Number of OpenMP threads to request.
 */
void expand_matrix(int num_rows_in_large_matrix, int row_length, long long* map,  float*small_matrix, float* large_matrix, const int num_threads){
  // Nothing to do for empty or nonsensical dimensions; this also keeps the
  // size_t conversions below well defined.
  if (num_rows_in_large_matrix <= 0 || row_length <= 0) return;

  const size_t row_len = static_cast<size_t>(row_length);
  const size_t row_bytes = row_len * sizeof(float);

  // A signed 64-bit index avoids the signed/unsigned comparison against the
  // int row count, and offsets are computed in size_t so they cannot wrap in
  // 32-bit arithmetic for large matrices.
  #pragma omp parallel for num_threads(num_threads) schedule(guided, 4)
  for(long long i = 0; i < num_rows_in_large_matrix; i++ ){
    float* dst = large_matrix + static_cast<size_t>(i) * row_len;
    const long long sml = map[i];
    if(sml == -1){
      for (size_t j = 0; j < row_len; j++)
        dst[j] = 0.5f;
    }
    else{
      // Only copy when a real source row exists; copying with sml == -1
      // would read before the start of small_matrix.
      memcpy(dst, small_matrix + static_cast<size_t>(sml) * row_len, row_bytes);
    }
  }
}


Time to expand using 1 threads = 0.402949
Time to expand using 2 threads = 0.530361
Time to expand using 4 threads = 0.608085
Time to expand using 8 threads = 0.667806
Time to expand using 16 threads = 0.999886



CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
L1d cache:             32K
L1i cache:             32K
L2 cache:              256K
L3 cache:              20480K

使用Ubuntu 16.04和gcc版本5.5.0 20171010(Ubuntu 5.5.0-12ubuntu1〜16.04)。

clock()测量的是CPU时间,它会随着您使用的CPU(线程)数量增加而增加。omp_get_wtime()测量的才是您希望减少的挂钟时间(实际经过的时间)。


1 thread = 0.423516
4 threads = 0.152680 
8 threads = 0.090841
16 threads = 0.064748


    // Timing with clock(): this measures CPU time, which increases as more
    // CPUs are used, so it does not show the parallel speed-up.
    clock_t start, end;
    start = clock();
    expand_matrix(large_matrix_rows, row_length, map, small_matrix, large_matrix, num_threads);
    end = clock();
    printf("Total time %f\n", double(end-start)/CLOCKS_PER_SEC);


    // Timing with omp_get_wtime(): this measures wall-clock time, which is
    // the quantity expected to decrease as threads are added.
    double start, end;
    start = omp_get_wtime();
    expand_matrix(large_matrix_rows, row_length, map, small_matrix, large_matrix, num_threads);
    end = omp_get_wtime();
    printf("Total time %f\n", end-start);