Question

我正在尝试实现朴素的线程矩阵乘法，我正在使用手动分配的结果数组为每个线性组合创建多个线程，并写入每个线程上的相应位置，但是我的代码运行速度比单个线程慢线程版本，是否使用内存会使代码变慢？

我使用堆分配来避免任何内存复制，但这可能是问题吗？

#define rows first
#define columns second 

void linear_combination(double const *arr_1,std::pair<int, int> sp_1,
                        double const *arr_2, std::pair<int, int> sp_2,
                        double *arr_3, std::pair<int, int> sp_3,
                        int base_row,int base_col){
    double sum = 0;
    for (int i = 0; i < sp_1.columns; i++){
        int idx_1 = base_row * sp_1.columns + i;
        int idx_2 = i * sp_2.columns + base_col;
        sum += arr_1[idx_1] * arr_2[idx_2];
    }
    int idx_3 = base_row * sp_3.columns + base_col;
    arr_3[idx_3] = sum;
}

auto matmul(double *m1, std::pair<int, int> sp_1, double *m2, std::pair<int, int> sp_2){
    // "sp_n" stands for shape for n-th matrix
    if (sp_1.second == sp_2.first){
        auto *m3 = (double *) malloc(sp_1.first*sp_2.second* sizeof(double));
        std::pair sp_3 = {sp_1.first, sp_2.second};

        for (int k = 0; k < sp_3.rows; k++){
            std::vector<std::thread> thread_list(sp_2.columns);
            for (int j = 0; j < sp_2.columns; j++){
                // will automatically save linear combination sum into m3

                thread_list[j] = ( std::thread(linear_combination,
                                               m1, sp_1,
                                               m2, sp_2,
                                               m3, sp_3,
                                               k, j) );
            }

            // join threads and use calculation
            std::for_each(thread_list.begin(), thread_list.end(), std::mem_fn(&std::thread::join));
        }

        return std::make_tuple(m3, sp_3);
    } else{
        puts("Size mismatch");
        printf("%d %d\n", sp_1.second, sp_2.first);
        double m3 = 0;
        return std::make_tuple(&m3, std::make_pair(0, 0));
    }
}

线程之间共享内存会使它们变慢吗？

0 个答案: