我希望通过更换算法或使用pthread来让我的程序运行得更快,但我不清楚该如何使用pthread,也不知道应该采用什么算法。有人能帮帮我吗?有没有什么算法可以让矩阵加法、查找最小值和最大值等操作更快?g_height是矩阵的行数,g_width是矩阵的列数。
/**
 * Adds `scalar` to every element of `matrix`.
 *
 * The sums are written into the matrix returned by new_matrix(), which is
 * then handed back to the caller; `matrix` itself is not modified.
 *
 * Examples:
 *   | 1 0 |          | 2 1 |
 *   | 0 1 | + 1  =>  | 1 2 |
 *
 *   | 1 2 |          | 5 6 |
 *   | 3 4 | + 4  =>  | 7 8 |
 */
uint32_t* scalar_add(const uint32_t* matrix, uint32_t scalar) {
    uint32_t* sums = new_matrix();

    ssize_t row = 0;
    while (row < g_height) {
        ssize_t col = 0;
        while (col < g_width) {
            // Element (row, col) is stored at offset row * g_width + col.
            sums[row * g_width + col] = scalar + matrix[row * g_width + col];
            col++;
        }
        row++;
    }

    return sums;
}
/**
 * Multiplies every element of `matrix` by `scalar`.
 *
 * The products are written into the matrix returned by new_matrix(), which
 * is then handed back to the caller; `matrix` itself is not modified.
 *
 * Examples:
 *   | 1 0 |          | 2 0 |
 *   | 0 1 | x 2  =>  | 0 2 |
 *
 *   | 1 2 |          | 2 4 |
 *   | 3 4 | x 2  =>  | 6 8 |
 */
uint32_t* scalar_mul(const uint32_t* matrix, uint32_t scalar) {
    uint32_t* scaled = new_matrix();

    for (ssize_t row = 0; row < g_height; row++) {
        for (ssize_t col = 0; col < g_width; col++) {
            const uint32_t element = matrix[row * g_width + col];
            scaled[row * g_width + col] = element * scalar;
        }
    }

    return scaled;
}
/**
 * Adds two matrices element by element.
 *
 * Each element of the returned matrix (obtained from new_matrix()) is the
 * sum of the elements of `matrix_a` and `matrix_b` at the same index.
 *
 * Examples:
 *   | 1 0 |   | 0 1 |      | 1 1 |
 *   | 0 1 | + | 1 0 |  =>  | 1 1 |
 *
 *   | 1 2 |   | 4 4 |      | 5 6 |
 *   | 3 4 | + | 4 4 |  =>  | 7 8 |
 */
uint32_t* matrix_add(const uint32_t* matrix_a, const uint32_t* matrix_b) {
    uint32_t* total = new_matrix();

    for (ssize_t row = 0; row < g_height; row++) {
        for (ssize_t col = 0; col < g_width; col++) {
            const uint32_t lhs = matrix_a[row * g_width + col];
            const uint32_t rhs = matrix_b[row * g_width + col];
            total[row * g_width + col] = lhs + rhs;
        }
    }

    return total;
}
/**
 * Returns a new matrix holding the product matrix_a x matrix_b.
 *
 * Neither operand is modified. Arithmetic is unsigned 32-bit, so sums and
 * products wrap around on overflow.
 *
 * NOTE(review): g_width is used both as the row stride and as the length of
 * the inner dimension, so this presumably expects square matrices
 * (g_height == g_width) -- confirm against the callers.
 *
 * Examples:
 *   | 1 2 |   | 1 0 |      | 1 2 |
 *   | 3 4 | x | 0 1 |  =>  | 3 4 |
 *
 *   | 1 2 |   | 5 6 |      | 19 22 |
 *   | 3 4 | x | 7 8 |  =>  | 43 50 |
 */
uint32_t* matrix_mul(const uint32_t* matrix_a, const uint32_t* matrix_b) {
    // NOTE(review): products are accumulated with +=, so this relies on
    // new_matrix() returning zero-filled storage -- confirm.
    uint32_t* result = new_matrix();

    // matrix_a is only read, so it is used directly instead of being copied
    // first; the copy was never released and cost an extra allocation.
    for (ssize_t y = 0; y < g_height; y++) {
        for (ssize_t i = 0; i < g_width; i++) {
            // Loop order y-i-x keeps the innermost loop walking matrix_b and
            // result along a row (consecutive addresses) rather than down a
            // column. Unsigned addition is order-independent, so the result
            // is the same as with the textbook y-x-i order.
            const uint32_t a = matrix_a[y * g_width + i];
            for (ssize_t x = 0; x < g_width; x++) {
                result[y * g_width + x] += a * matrix_b[i * g_width + x];
            }
        }
    }

    return result;
}
/**
 * Returns a new matrix equal to `matrix` raised to `exponent`.
 *
 * The result starts as identity_matrix() and is multiplied by `matrix`
 * `exponent` times via matrix_mul(); an exponent of 0 therefore yields the
 * identity matrix. `matrix` itself is not modified.
 *
 * Examples:
 *   | 1 2 |          | 1 0 |
 *   | 3 4 | ^ 0  =>  | 0 1 |
 *
 *   | 1 2 |          | 1 2 |
 *   | 3 4 | ^ 1  =>  | 3 4 |
 *
 *   | 1 2 |          | 199 290 |
 *   | 3 4 | ^ 4  =>  | 435 634 |
 */
uint32_t* matrix_pow(const uint32_t* matrix, uint32_t exponent) {
    // Only one starting matrix is needed. The previous version also called
    // new_matrix() (and identity_matrix() again for exponent 0) and then
    // overwrote those pointers without ever using them.
    uint32_t* power = identity_matrix();

    // The counter has the same type as `exponent`, which avoids a
    // signed/unsigned comparison.
    for (uint32_t step = 0; step < exponent; step++) {
        // NOTE(review): the matrix previously held in `power` is dropped here
        // without being released. The matching deallocation routine is not
        // visible from this function -- TODO release the intermediate once
        // the ownership rules of matrix_mul()/identity_matrix() are confirmed.
        power = matrix_mul(power, matrix);
    }

    return power;
}
////////////////////////////////
/// COMPUTATIONS //
////////////////////////////////
/**
 * Returns the sum of all elements of `matrix`.
 *
 * The sum is accumulated in unsigned 32-bit arithmetic, so it wraps around
 * on overflow.
 *
 * Examples:
 *   | 1 2 |
 *   | 2 1 |  =>  6
 *
 *   | 1 1 |
 *   | 1 1 |  =>  4
 */
uint32_t get_sum(const uint32_t* matrix) {
    // Accumulate in the element/return type; a signed int accumulator would
    // depend on implementation-defined conversions for large totals.
    uint32_t sum = 0;

    for (ssize_t y = 0; y < g_height; y++) {
        for (ssize_t x = 0; x < g_width; x++) {
            sum += matrix[y * g_width + x];
        }
    }

    return sum;
}
/**
 * Returns the trace of `matrix`: the sum of the elements whose row index
 * equals their column index.
 *
 * The sum is accumulated in unsigned 32-bit arithmetic, so it wraps around
 * on overflow.
 *
 * Examples:
 *   | 1 0 |
 *   | 0 1 |  =>  2
 *
 *   | 2 1 |
 *   | 1 2 |  =>  4
 */
uint32_t get_trace(const uint32_t* matrix) {
    uint32_t trace = 0;

    // Walk the diagonal directly instead of scanning every element and
    // testing y == x. The bound covers exactly the (d, d) positions that lie
    // inside both dimensions.
    for (ssize_t d = 0; d < g_height && d < g_width; d++) {
        trace += matrix[d * g_width + d];
    }

    return trace;
}
/**
 * Returns the smallest value stored in `matrix`.
 *
 * matrix[0] is read unconditionally as the starting candidate, so `matrix`
 * must contain at least one element.
 *
 * Examples:
 *   | 1 2 |
 *   | 3 4 |  =>  1
 *
 *   | 4 3 |
 *   | 2 1 |  =>  1
 */
uint32_t get_minimum(const uint32_t* matrix) {
    // Keep the candidate in the element type; storing it in a signed int
    // would rely on implementation-defined conversions for values above
    // INT_MAX.
    uint32_t min = matrix[0];

    for (ssize_t y = 0; y < g_height; y++) {
        for (ssize_t x = 0; x < g_width; x++) {
            const uint32_t value = matrix[y * g_width + x];
            if (value < min) {
                min = value;
            }
        }
    }

    return min;
}
/**
 * Returns the largest value stored in `matrix`.
 *
 * matrix[0] is read unconditionally as the starting candidate, so `matrix`
 * must contain at least one element.
 *
 * Examples:
 *   | 1 2 |
 *   | 3 4 |  =>  4
 *
 *   | 4 3 |
 *   | 2 1 |  =>  4
 */
uint32_t get_maximum(const uint32_t* matrix) {
    // Keep the candidate in the element type; storing it in a signed int
    // would rely on implementation-defined conversions for values above
    // INT_MAX.
    uint32_t max = matrix[0];

    for (ssize_t y = 0; y < g_height; y++) {
        for (ssize_t x = 0; x < g_width; x++) {
            const uint32_t value = matrix[y * g_width + x];
            if (value > max) {
                max = value;
            }
        }
    }

    return max;
}
/**
 * Returns how many elements of `matrix` are equal to `value`.
 *
 * Examples:
 *   | 1 1 |
 *   | 1 1 |  :: 1  =>  4
 *
 *   | 1 0 |
 *   | 0 1 |  :: 2  =>  0
 */
uint32_t get_frequency(const uint32_t* matrix, uint32_t value) {
    // Count in the return type, so the counter cannot hit signed overflow.
    uint32_t frequency = 0;

    for (ssize_t y = 0; y < g_height; y++) {
        for (ssize_t x = 0; x < g_width; x++) {
            if (matrix[y * g_width + x] == value) {
                frequency++;
            }
        }
    }

    return frequency;
}
答案 0 :(得分:2)
三个不同的方向:
您可以在这一Wikipedia章节(this Wikipedia section)中了解矩阵乘法如何自然地分解为可以并行处理的块。另外,请参阅下面Codor的评论,了解对缓存更友好的版本。在您彻底弄清楚前面的方案为什么不适合您之前,我不建议在实践中尝试这种做法。
矩阵乘法由许多标量积(点积)组成。通常,不按块、而是按这些小的乘积进行并行化会更高效。同样,最稳妥的选择是让某个库(如第一点所述)替您完成这项工作。实际上,要从这些做法中真正获得加速是非常棘手的。