我正在尝试实现朴素的线程矩阵乘法,我正在使用手动分配的结果数组为每个线性组合创建多个线程,并写入每个线程上的相应位置,但是我的代码运行速度比单个线程慢线程版本,是否使用内存会使代码变慢?
我使用堆分配来避免任何内存复制,但这可能是问题吗?
#define rows first
#define columns second
void linear_combination(double const *arr_1,std::pair<int, int> sp_1,
double const *arr_2, std::pair<int, int> sp_2,
double *arr_3, std::pair<int, int> sp_3,
int base_row,int base_col){
double sum = 0;
for (int i = 0; i < sp_1.columns; i++){
int idx_1 = base_row * sp_1.columns + i;
int idx_2 = i * sp_2.columns + base_col;
sum += arr_1[idx_1] * arr_2[idx_2];
}
int idx_3 = base_row * sp_3.columns + base_col;
arr_3[idx_3] = sum;
}
auto matmul(double *m1, std::pair<int, int> sp_1, double *m2, std::pair<int, int> sp_2){
// "sp_n" stands for shape for n-th matrix
if (sp_1.second == sp_2.first){
auto *m3 = (double *) malloc(sp_1.first*sp_2.second* sizeof(double));
std::pair sp_3 = {sp_1.first, sp_2.second};
for (int k = 0; k < sp_3.rows; k++){
std::vector<std::thread> thread_list(sp_2.columns);
for (int j = 0; j < sp_2.columns; j++){
// will automatically save linear combination sum into m3
thread_list[j] = ( std::thread(linear_combination,
m1, sp_1,
m2, sp_2,
m3, sp_3,
k, j) );
}
// join threads and use calculation
std::for_each(thread_list.begin(), thread_list.end(), std::mem_fn(&std::thread::join));
}
return std::make_tuple(m3, sp_3);
} else{
puts("Size mismatch");
printf("%d %d\n", sp_1.second, sp_2.first);
double m3 = 0;
return std::make_tuple(&m3, std::make_pair(0, 0));
}
}