Question

我正在尝试并行化一个名为streamcluster的程序。更具体地说，名为pgain的函数根据我使用的Scalasca工具花费程序的大部分时间，所以这是我应该并行化的函数。在这里，您可以看到功能和我在并行化方面的努力。问题是，我唯一能做到的就是花更多时间执行的程序。

streamcluster中的原始pgain函数：

double pgain ( long x, Points *points, double z, long int *numcenters )
{
int i;
int number_of_centers_to_close = 0;

static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;

int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof ( double );
if ( stride % cl != 0 ) {
    stride = cl * ( stride / cl + 1 );
}
int K = stride - 2 ; // K==*numcenters

//my own cost of opening x
double cost_of_opening_x = 0;

work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;

/*
 * For each center, we have a *lower* field that indicates
 * how much we will save by closing the center.
 */
int count = 0;
for ( int i = 0; i < points->num; i++ ) {
    if ( is_center[i] ) {
        center_table[i] = count++;
    }
}
work_mem[0] = 0;

//now we finish building the table. clear the working memory.
memset ( switch_membership, 0, points->num * sizeof ( bool ) );
memset ( work_mem, 0, stride*sizeof ( double ) );
memset ( work_mem+stride,0,stride*sizeof ( double ) );

//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];

for ( i = 0; i < points->num; i++ ) {
    float x_cost = dist ( points->p[i], points->p[x], points->dim ) * points->p[i].weight;
    float current_cost = points->p[i].cost;

    if ( x_cost < current_cost ) {

        // point i would save cost just by switching to x
        // (note that i cannot be a median,
        // or else dist(p[i], p[x]) would be 0)

        switch_membership[i] = 1;
        cost_of_opening_x += x_cost - current_cost;

    } else {

        // cost of assigning i to x is at least current assignment cost of i

        // consider the savings that i's **current** median would realize
        // if we reassigned that median and all its members to x;
        // note we've already accounted for the fact that the median
        // would save z by closing; now we have to subtract from the savings
        // the extra cost of reassigning that median and its members
        int assign = points->p[i].assign;
        lower[center_table[assign]] += current_cost - x_cost;
    }
}

// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it

for ( int i = 0; i < points->num; i++ ) {
    if ( is_center[i] ) {
        double low = z + work_mem[center_table[i]];
        gl_lower[center_table[i]] = low;
        if ( low > 0 ) {
            // i is a median, and
            // if we were to open x (which we still may not) we'd close i

            // note, we'll ignore the following quantity unless we do open x
            ++number_of_centers_to_close;
            cost_of_opening_x -= low;
        }
    }
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;

gl_number_of_centers_to_close = ( int ) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];

// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing

if ( gl_cost_of_opening_x < 0 ) {
    //  we'd save money by opening x; we'll do it
    for ( int i = 0; i < points->num; i++ ) {
        bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
        if ( switch_membership[i] || close_center ) {
            // Either i's median (which may be i itself) is closing,
            // or i is closer to x than to its current median
            points->p[i].cost = points->p[i].weight * dist ( points->p[i], points->p[x], points->dim );
            points->p[i].assign = x;
        }
    }
    for ( int i = 0; i < points->num; i++ ) {
        if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
            is_center[i] = false;
        }
    }
    if ( x >= 0 && x < points->num ) {
        is_center[x] = true;
    }

    *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
    gl_cost_of_opening_x = 0;  // the value we'll return
}

free ( work_mem );

return -gl_cost_of_opening_x;
}

这就是我所做的并行化：

double pgain ( long x, Points *points, double z, long int *numcenters )
{
int i;
int number_of_centers_to_close = 0;

static double *work_mem;
static double gl_cost_of_opening_x;
static int gl_number_of_centers_to_close;

int stride = *numcenters + 2;
//make stride a multiple of CACHE_LINE
int cl = CACHE_LINE/sizeof ( double );
if ( stride % cl != 0 ) {
    stride = cl * ( stride / cl + 1 );
}
int K = stride - 2 ; // K==*numcenters

//my own cost of opening x
double cost_of_opening_x = 0;

work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
gl_cost_of_opening_x = 0;
gl_number_of_centers_to_close = 0;

/*
 * For each center, we have a *lower* field that indicates
 * how much we will save by closing the center.
 */
int count = 0;
for ( int i = 0; i < points->num; i++ ) {
    if ( is_center[i] ) {
        center_table[i] = count++;
    }
}
work_mem[0] = 0;

//now we finish building the table. clear the working memory.
memset ( switch_membership, 0, points->num * sizeof ( bool ) );
memset ( work_mem, 0, stride*sizeof ( double ) );
memset ( work_mem+stride,0,stride*sizeof ( double ) );

//my *lower* fields
double* lower = &work_mem[0];
//global *lower* fields
double* gl_lower = &work_mem[stride];
float x_cost=0.0;
float current_cost=0.0;

#pragma omp parallel for private(current_cost,x_cost) 
shared(cost_of_opening_x)  


for ( i = 0; i < points->num; i++ ) {

    x_cost = dist ( points->p[i], points->p[x], points->dim ) * points->p[i].weight;
    current_cost = points->p[i].cost;

    if ( x_cost < current_cost ) {

        // point i would save cost just by switching to             // x
        // (note that i cannot be a median,
        // or else dist(p[i], p[x]) would be 0)

        switch_membership[i] = 1;

        cost_of_opening_x += x_cost - current_cost;
        {
        #pragma omp flush(cost_of_opening_x)
        }
    } else {

        // cost of assigning i to x is at least current assignment cost of i

        // consider the savings that i's **current** median would realize
        // if we reassigned that median and all its members to x;
        // note we've already accounted for the fact that the median
        // would save z by closing; now we have to subtract from the savings
        // the extra cost of reassigning that median and its members
        int assign = points->p[i].assign;

        lower[center_table[assign]] += current_cost - x_cost;
        {
        #pragma omp flush(lower) 
        }
    }
#pragma omp barrier
{
#pragma omp flush(lower,cost_of_opening_x)
}   
}

// at this time, we can calculate the cost of opening a center
// at x; if it is negative, we'll go through with opening it
for ( int i = 0; i < points->num; i++ ) {
    if ( is_center[i] ) {
        double low = z + work_mem[center_table[i]];
        gl_lower[center_table[i]] = low;
        if ( low > 0 ) {
            // i is a median, and
            // if we were to open x (which we still may not) we'd close i

            // note, we'll ignore the following quantity unless we do open x
            ++number_of_centers_to_close;
            cost_of_opening_x -= low;
        }
    }
}
//use the rest of working memory to store the following
work_mem[K] = number_of_centers_to_close;
work_mem[K+1] = cost_of_opening_x;

gl_number_of_centers_to_close = ( int ) work_mem[K];
gl_cost_of_opening_x = z + work_mem[K+1];

// Now, check whether opening x would save cost; if so, do it, and
// otherwise do nothing

if ( gl_cost_of_opening_x < 0 ) {
    //  we'd save money by opening x; we'll do it
    #pragma omp parallel for
    for ( int i = 0; i < points->num; i++ ) {
        bool close_center = gl_lower[center_table[points->p[i].assign]] > 0 
;
        if ( switch_membership[i] || close_center ) {
            // Either i's median (which may be i itself) is closing,
            // or i is closer to x than to its current median
            points->p[i].cost = points->p[i].weight * dist ( points->p[i], 
points->p[x], points->dim );
            points->p[i].assign = x;
        }
    }
    for ( int i = 0; i < points->num; i++ ) {
        if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
            is_center[i] = false;
        }
    }
    if ( x >= 0 && x < points->num ) {
        is_center[x] = true;
    }

    *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
} else {
    gl_cost_of_opening_x = 0;  // the value we'll return
}

free ( work_mem );

return -gl_cost_of_opening_x;
}

你能看到任何可以加快速度的改进或改变吗？提前谢谢。

Answer 1

我没有考虑你的代码逻辑（既不是串行也不是并行）。但我试图找出一些可以并行化的片段。我没有编译我建议的代码。因此，我的回答是指出一些可能性使您的代码更快。当然，必须验证，共享和私人变量必须进行分析等。基于此，我的建议是：

double pgain ( long x, Points *points, double z, long int *numcenters )
{
    int i;
    int number_of_centers_to_close = 0;

    static double *work_mem;
    static double gl_cost_of_opening_x;
    static int gl_number_of_centers_to_close;

    int stride = *numcenters + 2;
    //make stride a multiple of CACHE_LINE
    int cl = CACHE_LINE/sizeof ( double );
    if ( stride % cl != 0 ) {
        stride = cl * ( stride / cl + 1 );
    }
    int K = stride - 2 ; // K==*numcenters

    //my own cost of opening x
    double cost_of_opening_x = 0;
    work_mem = ( double* ) malloc ( 2 * stride * sizeof ( double ) );
    gl_cost_of_opening_x = 0;
    gl_number_of_centers_to_close = 0;

    int count = 0;
    //my *lower* fields
    double* lower;
    //global *lower* fields
    double* gl_lower;

    #pragma omp parallel
    {
        /*
         * For each center, we have a *lower* field that indicates
         * how much we will save by closing the center.
         */

        int i;
        #pragma omp for private(i)
        for ( i = 0; i < points->num; i++ ) {
            if ( is_center[i] ) {
                #pragma omp critical
                center_table[i] = count++;
            }
        }

        #pragma omp single
        work_mem[0] = 0;

        #pragma omp sections
        {
            //now we finish building the table. clear the working memory.
            #pragma omp section
            memset ( switch_membership, 0, points->num * sizeof ( bool ) );

            #pragma omp section
            memset ( work_mem, 0, stride*sizeof ( double ) );

            #pragma omp section
            memset ( work_mem+stride,0,stride*sizeof ( double ) );
        }

        #pragma omp single
        {
            lower = &work_mem[0];
            gl_lower = &work_mem[stride];
        }

        float x_cost, current_cost;

        #pragma omp for private(i, x_cost, current_cost)
        for ( i = 0; i < points->num; i++ ) {

            x_cost = dist ( points->p[i], points->p[x], points->dim ) * points->p[i].weight;
            current_cost = points->p[i].cost;

            if ( x_cost < current_cost ) {

                // point i would save cost just by switching to x
                // (note that i cannot be a median,
                // or else dist(p[i], p[x]) would be 0)

                switch_membership[i] = 1;

                #pragma omp critical
                cost_of_opening_x += x_cost - current_cost;

            } else {

                // cost of assigning i to x is at least current assignment cost of i

                // consider the savings that i's **current** median would realize
                // if we reassigned that median and all its members to x;
                // note we've already accounted for the fact that the median
                // would save z by closing; now we have to subtract from the savings
                // the extra cost of reassigning that median and its members
                int assign = points->p[i].assign;

                #pragma omp critical
                lower[center_table[assign]] += current_cost - x_cost;
            }
        }

        // at this time, we can calculate the cost of opening a center
        // at x; if it is negative, we'll go through with opening it

        double low;

        #pragma omp for private(i, low)
        for ( int i = 0; i < points->num; i++ ) {
            if ( is_center[i] ) {
                low = z + work_mem[center_table[i]];
                #pragma omp critical
                gl_lower[center_table[i]] = low;
                if ( low > 0 ) {
                    // i is a median, and
                    // if we were to open x (which we still may not) we'd close i

                    // note, we'll ignore the following quantity unless we do open x
                    #pragma omp atomic
                    ++number_of_centers_to_close;
                    #pragma omp critical
                    cost_of_opening_x -= low;
                }
            }
        }

        #pragma omp sections
        {
            //use the rest of working memory to store the following
            #pragma omp section
            work_mem[K] = number_of_centers_to_close;

            #pragma omp section
            work_mem[K+1] = cost_of_opening_x;

            #pragma omp section
            gl_number_of_centers_to_close = ( int ) work_mem[K];

            #pragma omp section
            gl_cost_of_opening_x = z + work_mem[K+1];
        }

        // Now, check whether opening x would save cost; if so, do it, and
        // otherwise do nothing
        bool close_center;

        if ( gl_cost_of_opening_x < 0 ) {
            //  we'd save money by opening x; we'll do it
            #pragma omp for private(i)
            for ( i = 0; i < points->num; i++ ) {
                close_center = gl_lower[center_table[points->p[i].assign]] > 0 ;
                if ( switch_membership[i] || close_center ) {
                    // Either i's median (which may be i itself) is closing,
                    // or i is closer to x than to its current median
                    points->p[i].cost = points->p[i].weight * dist ( points->p[i], points->p[x], points->dim );
                    points->p[i].assign = x;
                }
            }
            #pragma omp for private(i)
            for ( i = 0; i < points->num; i++ ) {
                if ( is_center[i] && gl_lower[center_table[i]] > 0 ) {
                    is_center[i] = false;
                }
            }
            if ( x >= 0 && x < points->num ) {
                is_center[x] = true;
            }

            #pragma omp single
            *numcenters = *numcenters + 1 - gl_number_of_centers_to_close;
        } else {
            #pragma omp single
            gl_cost_of_opening_x = 0;  // the value we'll return
        }

        #pragma omp single
        free ( work_mem );
    }

    return -gl_cost_of_opening_x;
}

使用openmp并行化流集群

1 个答案: