Question

我希望通过改进算法或使用 pthread 来让我的程序运行得更快，但我不清楚该如何使用 pthread，也不知道应该采用什么算法。有人能帮帮我吗？有没有什么算法可以加快矩阵加法、查找最小值和最大值等运算？其中 g_height 是矩阵的行数，g_width 是矩阵的列数。

/**
 * Builds a matrix in which every element of `matrix` has been increased
 * by `scalar`.
 *
 *     1 0        2 1
 *     0 1 + 1 => 1 2
 *
 *     1 2        5 6
 *     3 4 + 4 => 7 8
 *
 * The input is only read. The returned buffer comes from new_matrix() and
 * is indexed row-major (element (y, x) lives at y * g_width + x).
 * Sums are stored as uint32_t, so they are reduced modulo 2^32.
 */
uint32_t* scalar_add(const uint32_t* matrix, uint32_t scalar) {

    uint32_t* sums = new_matrix();

    for (ssize_t row = 0; row < g_height; row++) {
        /* Start of the current row in the source and in the output. */
        const uint32_t* src_row = matrix + row * g_width;
        uint32_t* dst_row = sums + row * g_width;

        for (ssize_t col = 0; col < g_width; col++) {
            dst_row[col] = src_row[col] + scalar;
        }
    }
    return sums;
}

/**
 * Builds a matrix in which every element of `matrix` has been multiplied
 * by `scalar`.
 *
 *     1 0        2 0
 *     0 1 x 2 => 0 2
 *
 *     1 2        2 4
 *     3 4 x 2 => 6 8
 *
 * The input is only read. The returned buffer comes from new_matrix().
 * Products are stored as uint32_t, so they are reduced modulo 2^32.
 */
uint32_t* scalar_mul(const uint32_t* matrix, uint32_t scalar) {

    uint32_t* scaled = new_matrix();

    ssize_t row = 0;
    while (row < g_height) {
        /* Row-major layout: row `row` starts at offset row * g_width. */
        const uint32_t* in_row = matrix + row * g_width;
        uint32_t* out_row = scaled + row * g_width;

        for (ssize_t col = 0; col < g_width; col++) {
            out_row[col] = in_row[col] * scalar;
        }
        row++;
    }
    return scaled;
}

/**
 * Builds the element-wise sum of `matrix_a` and `matrix_b`.
 *
 *     1 0   0 1    1 1
 *     0 1 + 1 0 => 1 1
 *
 *     1 2   4 4    5 6
 *     3 4 + 4 4 => 7 8
 *
 * Both operands are only read and are indexed with the same
 * g_height x g_width row-major layout. The returned buffer comes from
 * new_matrix(). Sums are stored as uint32_t (reduced modulo 2^32).
 */
uint32_t* matrix_add(const uint32_t* matrix_a, const uint32_t* matrix_b) {

    uint32_t* total = new_matrix();

    for (ssize_t row = 0; row < g_height; row++) {
        /* All three matrices share the same row offset. */
        const uint32_t* lhs = matrix_a + row * g_width;
        const uint32_t* rhs = matrix_b + row * g_width;
        uint32_t* out = total + row * g_width;

        for (ssize_t col = 0; col < g_width; col++) {
            out[col] = lhs[col] + rhs[col];
        }
    }
    return total;
}

/**
 * Returns a new matrix holding the matrix product matrix_a x matrix_b.
 *
 *     1 2   1 0    1 2
 *     3 4 x 0 1 => 3 4
 *
 *     1 2   5 6    19 22
 *     3 4 x 7 8 => 43 50
 *
 * Both operands are only read, so no working copy is made (the previous
 * implementation cloned matrix_a and never released the clone).
 * The returned buffer comes from new_matrix(); every element is assigned
 * here, so the result does not depend on that buffer's initial contents.
 *
 * The inner index runs to g_width for both the columns of matrix_a and the
 * rows of matrix_b, so this assumes square matrices
 * (g_height == g_width) - TODO confirm against the callers.
 *
 * Arithmetic is done in uint32_t and therefore wraps modulo 2^32.
 */
uint32_t* matrix_mul(const uint32_t* matrix_a, const uint32_t* matrix_b) {

    uint32_t* result = new_matrix();

    for (ssize_t y = 0; y < g_height; y++) {
        const uint32_t* row_a = matrix_a + y * g_width;
        uint32_t* row_out = result + y * g_width;

        for (ssize_t x = 0; x < g_width; x++) {
            /* Dot product of row y of A with column x of B, kept in a
               local so the output cell is written exactly once. */
            uint32_t dot = 0;
            for (ssize_t i = 0; i < g_width; i++) {
                dot += row_a[i] * matrix_b[i * g_width + x];
            }
            row_out[x] = dot;
        }
    }
    return result;
}

/**
 * Returns a new matrix equal to `matrix` raised to `exponent`.
 *
 *     1 2        1 0
 *     3 4 ^ 0 => 0 1
 *
 *     1 2        1 2
 *     3 4 ^ 1 => 3 4
 *
 *     1 2        199 290
 *     3 4 ^ 4 => 435 634
 *
 * Uses square-and-multiply, so it needs O(log exponent) calls to
 * matrix_mul() instead of one call per unit of the exponent. The result is
 * unchanged because matrix multiplication is associative (also under the
 * modulo-2^32 wrap-around of uint32_t arithmetic).
 *
 * An exponent of 0 yields the matrix returned by identity_matrix().
 * `matrix` itself is never modified or released.
 *
 * NOTE(review): intermediates are released with free(); this assumes that
 * identity_matrix() and matrix_mul() return malloc-family allocations and
 * that <stdlib.h> is included by this file - confirm both.
 */
uint32_t* matrix_pow(const uint32_t* matrix, uint32_t exponent) {

    uint32_t* result = identity_matrix();

    /* `base` is matrix^(2^k) for the bit currently being examined. It
       starts out as the caller's matrix (not ours to free); once squared it
       points at `owned_base`, which this function must release. */
    const uint32_t* base = matrix;
    uint32_t* owned_base = NULL;

    while (exponent > 0) {
        if (exponent & 1u) {
            uint32_t* next = matrix_mul(result, base);
            free(result);
            result = next;
        }

        exponent >>= 1;

        /* Only square when another bit remains, to avoid a wasted multiply. */
        if (exponent > 0) {
            uint32_t* squared = matrix_mul(base, base);
            free(owned_base);
            owned_base = squared;
            base = squared;
        }
    }

    free(owned_base);
    return result;
}

////////////////////////////////
///       COMPUTATIONS       //
////////////////////////////////

/**
 * Returns the sum of all elements.
 *
 *     1 2
 *     2 1 => 6
 *
 *     1 1
 *     1 1 => 4
 *
 * The running total is kept in uint32_t (the element and return type), so
 * it wraps modulo 2^32 instead of passing through a signed int.
 * Returns 0 when the matrix has no elements.
 */
uint32_t get_sum(const uint32_t* matrix) {

    uint32_t sum = 0;

    for (ssize_t y = 0; y < g_height; y++) {
        const uint32_t* row = matrix + y * g_width;
        for (ssize_t x = 0; x < g_width; x++) {
            sum += row[x];
        }
    }
    return sum;
}

/**
 * Returns the trace of the matrix (the sum of the elements whose row and
 * column index are equal).
 *
 *     1 0
 *     0 1 => 2
 *
 *     2 1
 *     1 2 => 4
 *
 * Only the diagonal is visited, rather than scanning every cell and
 * testing y == x. The running total is kept in uint32_t and wraps
 * modulo 2^32.
 */
uint32_t get_trace(const uint32_t* matrix) {

    uint32_t trace = 0;

    /* Diagonal element i sits at i * g_width + i. The double bound keeps
       the same cells as the original nested scan if the matrix is not
       square. */
    for (ssize_t i = 0; i < g_height && i < g_width; i++) {
        trace += matrix[i * g_width + i];
    }
    return trace;
}

/**
 * Returns the smallest value in the matrix.
 *
 *     1 2
 *     3 4 => 1
 *
 *     4 3
 *     2 1 => 1
 *
 * The candidate is kept in uint32_t so values of 2^31 and above are not
 * squeezed through a signed int. It starts at UINT32_MAX, so a matrix with
 * no elements yields UINT32_MAX instead of reading matrix[0].
 */
uint32_t get_minimum(const uint32_t* matrix) {

    uint32_t min = UINT32_MAX;

    for (ssize_t y = 0; y < g_height; y++) {
        const uint32_t* row = matrix + y * g_width;
        for (ssize_t x = 0; x < g_width; x++) {
            if (row[x] < min) {
                min = row[x];
            }
        }
    }
    return min;
}

/**
 * Returns the largest value in the matrix.
 *
 *     1 2
 *     3 4 => 4
 *
 *     4 3
 *     2 1 => 4
 *
 * The candidate is kept in uint32_t so values of 2^31 and above are not
 * squeezed through a signed int. It starts at 0, so a matrix with no
 * elements yields 0 instead of reading matrix[0].
 */
uint32_t get_maximum(const uint32_t* matrix) {

    uint32_t max = 0;

    for (ssize_t y = 0; y < g_height; y++) {
        const uint32_t* row = matrix + y * g_width;
        for (ssize_t x = 0; x < g_width; x++) {
            if (row[x] > max) {
                max = row[x];
            }
        }
    }
    return max;
}

/**
 * Returns how many elements of the matrix are equal to `value`.
 *
 *     1 1
 *     1 1 :: 1 => 4
 *
 *     1 0
 *     0 1 :: 2 => 0
 *
 * The counter uses the function's return type (uint32_t) rather than a
 * signed int. Returns 0 when the matrix has no elements.
 */
uint32_t get_frequency(const uint32_t* matrix, uint32_t value) {

    uint32_t frequency = 0;

    for (ssize_t y = 0; y < g_height; y++) {
        const uint32_t* row = matrix + y * g_width;
        for (ssize_t x = 0; x < g_width; x++) {
            if (row[x] == value) {
                frequency++;
            }
        }
    }
    return frequency;
}

Answer 1

三个不同的方向：

在实践中，加速乘法的最佳选择是使用现成的库，例如 MKL 或 ATLAS。
您可以在维基百科的相关章节中了解矩阵乘法如何自然地分解为可以并行化的分块。另外，请参阅下面 Codor 提出的观点，其中介绍了对缓存更友好的版本。在您彻底弄清楚为什么前一个选项不适合您之前，我不建议在实践中尝试这种做法。
矩阵乘法由许多标量积（点积）组成。通常，不按分块、而是按这些小的乘积进行并行化会更高效。同样，最稳妥的选择是让某个库（如第一点所述）替您完成这件事。要真正从这些手段中获得加速其实非常棘手。

在 C 语言中更快地进行矩阵计算

1 个答案: