我需要将C代码与OpenMP并行化。我做到了,就像下面的代码。该代码的性能从0.001XXXs降为1.XXXs,因此存在竞争条件(瓶颈)。在代码中,我已注释了它们的位置(在MAYUS中进行了注释,没有列表化,以实现更好的可视化//BOTTLENECK
)。我不知道为什么这种护理条件。
代码是
#define sqr(x) ((x)*(x))
#define MAX_ITER_NO_IMPR 10
void fail(const char * str) {
fprintf(stderr,"%s", str);
exit(-1);
}
/**
* calc_distance calculates the distance between a given point and a cluster
* @param int -dim: number of columns (variables) in the data set to be classified
* @param float * -: first arrray to calculate de distance
* @param float * -: Second array to calculate de distance
* @return float: Euclidean distance of two vectors
*/
float calc_distance(int dim, float *restrict p1, float *restrict p2) {
float distance_sq_sum = 0;
#pragma omp parallel for reduction(+:distance_sq_sum)
for (int i = 0; i < dim; ++i)
distance_sq_sum += sqr(p1[i] - p2[i]);
return distance_sq_sum;
}
/**
* calc_all_distances computes the euclidean distances between centros ids and dataset points.
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param float * -centroid: prototypes of each cluster.
* @param float * -distance_output[n][k] contains the distance between all elements * in the dataset and all clusters
* return void
*/
void calc_all_distances(int dim, int n, int k, float *restrict X, float *restrict centroid, float *distance_output) {
#pragma omp parallel for simd
for (int i = 0; i < n; ++i) // for each point
for (int j = 0; j < k; ++j) // for each cluster
// calculate distance between point and cluster centroid
distance_output[i*k+j] = calc_distance(dim, &X[i*dim], ¢roid[j*dim]);
}
/**
* calc_total_distance calculates the clustering overall distance.
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param float * -centroid: prototypes of each cluster.
* @param int * - cluster_assignment_index: current cluster assignment to each point
* @return float overall distance. This is what the algorithm tried to minimize
*/
float calc_total_distance(int dim, int n, int k, float *restrict X, float *restrict centroids, int *restrict cluster_assignment_index) {
// NOTE: a point with cluster assignment -1 is ignored
float tot_D = 0;
// for every point
#pragma omp parallel for simd reduction(+:tot_D)
//BOTTLENECK
for (int i = 0; i < n; ++i) {
// which cluster is it in?
int active_cluster = cluster_assignment_index[i];
// sum distance
if (active_cluster != -1)
tot_D += calc_distance(dim, &X[i*dim], ¢roids[active_cluster*dim]);
}
return tot_D;
}
/**
* choose_all_clusters_from_distances obtains the closest cluster for each point.
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified
* @param int -k: number of clusters to be calculated
* @param float * -distance_array[n][k] contains the distance between all elements * in the dataset and all clusters
* @param int* - cluster_assignment_index contains the assigned cluster to each point
* @return void
*/
void choose_all_clusters_from_distances(int dim, int n, int k, float *restrict distance_array, int *cluster_assignment_index) {
// for each point
#pragma omp parallel for simd
for (int i = 0; i < n; ++i) {
int best_index = -1;
float closest_distance = INFINITY;
// for each cluster
// #pragma omp privete(best_index, closest_distance)
for (int j = 0; j < k; ++j) {
// distance between point and cluster centroid
float cur_distance = distance_array[i*k+j];
if (cur_distance < closest_distance) {
best_index = j;
closest_distance = cur_distance;
}
}
// record in array
cluster_assignment_index[i] = best_index;
}
}
/**
* calc_cluster_centroids calculates the new prototypes of all clusters
* @param int -dim: number of columns (variables) in the data set to be classified
* @param int -n: number of rows (points) in the data set to be classified
* @param int -k: number of clusters to be calculated
* @param float * -X: dataset to be classified
* @param int * - cluster_assigment_index:
* @param float * -new_cluster_centroid: it is the output with the new cluster prototypes
*/
void calc_cluster_centroids(int dim, int n, int k, float *restrict X, int *restrict cluster_assignment_index, float *new_cluster_centroid) {
int * cluster_member_count = (int *) calloc (k,sizeof(float));
// sum all points
// for every point
#pragma omp parallel for simd
//BOTTLENECK
for (int i = 0; i < n; ++i) {
// which cluster is it in?
int active_cluster = cluster_assignment_index[i];
// update count of members in that cluster
++cluster_member_count[active_cluster];
// sum point coordinates for finding centroid
for (int j = 0; j < dim; ++j)
new_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
}
// USAR CONCELLATION POINT FOR O LA OTRA FORMA COMENTADA?
// now divide each coordinate sum by number of members to find mean/centroid
// for each cluster
#pragma omp for
for (int i = 0; i < k; ++i) {
if (cluster_member_count[i] == 0) {
//printf("WARNING: Empty cluster %d! \n", i);
//break;
#pragma omp cancel for
}
#pragma omp cancellation point for
// for each dimension
#pragma omp simd
for (int j = 0; j < dim; ++j)
new_cluster_centroid[i*dim + j] /= cluster_member_count[i]; /// XXXX will divide by zero here for any empty clusters!
}
}
/**
* get_cluster_member_count the member of each cluster
* @param int -n: number of rows (points) in the data set to be classified
* @param int -k: number of clusters to be calculated
* @param int* - cluster_assignment_index contains the assigned cluster to each point
* @param int * -cluster_member_count: count members of each cluster
*/
void get_cluster_member_count(int n, int k, int *restrict cluster_assignment_index, int *cluster_member_count) {
// count members of each cluster
#pragma omp parallel for
for (int i = 0; i < n; ++i)
#pragma omp atomic update
++cluster_member_count[cluster_assignment_index[i]];
}
/**
* Visualize the number of members for all clusters
*/
void cluster_diag(int dim, int n, int k, float *restrict X, int *restrict cluster_assignment_index, float *restrict cluster_centroid) {
int * cluster_member_count = (int *) calloc (k, sizeof(int));
get_cluster_member_count(n, k, cluster_assignment_index, cluster_member_count);
printf(" Final clusters\n");
#pragma omp parallel for ordered
for (int i = 0; i < k; ++i) {
#pragma omp ordered
printf("\tcluster %d: members: %8d, for the centroid (", i, cluster_member_count[i]);
for (int j = 0; j < dim; ++j)
#pragma omp ordered
printf ("%f, ", cluster_centroid[i*dim + j]);
#pragma omp ordered
printf (")\n");
}
}
void copy_assignment_array(int n, int *restrict src, int *tgt) {
#pragma omp parallel for simd
for (int i = 0; i < n; ++i)
tgt[i] = src[i];
}
int assignment_change_count(int n, int a[], int b[]) {
int change_count = 0;
#pragma omp parallel for reduction(+:change_count)
for (int i = 0; i < n; ++i)
if (a[i] != b[i])
++change_count;
return change_count;
}
/*
* This is C source code for a simple implementation of the popular k-means clustering algorithm.
* It is based on the implementation in Matlab, which was in turn based on GAF Seber,
* Multivariate Observations, 1964, and H Spath, Cluster Dissection and Analysis: Theory, FORTRAN Programs, Examples.
* @param int -dim: number of columns (variables) in the data set to be classified (dimension of data)
* @param float * -X: dataset to be classified (pointer to data)
* @param int -n: number of rows (points) in the data set to be classified (number of elements)
* @param int -k: number of clusters to be calculated
* @param float * -cluster_centroid: Initial clusters prototypes or centros (initial cluster centroids)
* @param int iterations -: number of iterations to be performed
* @param int * cluster_assignment_final -: Output classitfication
*/
void kmeans(int dim, float *X, int n, int k, float *cluster_centroid, int iterations, int *cluster_assignment_final) {
int floatPointerSize = n * k * sizeof(float);
int intPointerSize = n * sizeof(int);
float *dist = (float *) malloc( floatPointerSize );
int *cluster_assignment_cur = (int *) malloc( intPointerSize );
int *cluster_assignment_prev = (int *) malloc( intPointerSize );
float *point_move_score = (float *) malloc( floatPointerSize );
if (!dist || !cluster_assignment_cur || !cluster_assignment_prev || !point_move_score)
fail("Error allocating dist arrays\n");
// Initial setup. Assignment Step
calc_all_distances(dim, n, k, X, cluster_centroid, dist);
choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);
//The initial quality is the one obtained from the random election
float prev_totD = calc_total_distance(dim, n, k, X, cluster_centroid, cluster_assignment_cur);
int numVariations = 0;
// UPDATE STEP
// for (int batch=0; (batch < iterations) && (numVariations <MAX_ITER_NO_IMPR); ++batch) {
for (int batch = 0; batch < iterations; ++batch) {
//printf("Batch step: %d \n", batch);
//cluster_diag(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
// update cluster centroids. Update Step
calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
float totD = calc_total_distance(dim, n, k, X, cluster_centroid, cluster_assignment_cur);
// see if we've failed to improve
if (totD >= prev_totD){
// failed to improve - currently solution worse than previous
// restore old assignments
copy_assignment_array(n, cluster_assignment_prev, cluster_assignment_cur);
// recalc centroids
// calc_cluster_centroids(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
//printf("\tNegative progress made on this step - iteration completed (%.2f) \n", prev_totD-totD);
++numVariations; //To implement no convergence criteria
}
else { // We have made some improvements
// save previous step
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_prev);
// move all points to nearest cluster
calc_all_distances(dim, n, k, X, cluster_centroid, dist);
choose_all_clusters_from_distances(dim, n, k, dist, cluster_assignment_cur);
//check how many assignments are different
//int change_count = assignment_change_count(n, cluster_assignment_cur, cluster_assignment_prev);
//printf("\tIn the batch: %d, has changed: %d element to a different cluster with an improvement of %f \n", batch, change_count, prev_totD-totD);
//fflush(stdout);
prev_totD = totD;
}
}
//
cluster_diag(dim, n, k, X, cluster_assignment_cur, cluster_centroid);
// write to output array
copy_assignment_array(n, cluster_assignment_cur, cluster_assignment_final);
//Free memory
free(dist);
free(cluster_assignment_cur);
free(cluster_assignment_prev);
free(point_move_score);
}
/**
* random_init_centroid chooses random prototypes that belong to the dataset. They are points of the dataset.
*@param float * -: cluster_centro_if: clustes id choosen
*@param float * -: dataSetMatrix
*@param int clusters: Number of cluster to be don.
*@param int rows in number of rows in the dataset; i.e. points
*@param int columns: number of columns. Point's dimension.
*@return void
*/
void random_init_centroid (float * cluster_centro_id, float * dataSetMatrix, int clusters, int rows, int columns) {
srand(time(NULL));
for (int i=0; i<clusters; ++i) {
int r = rand()%rows;
for (int j=0; j<columns;++j) {
cluster_centro_id[i*columns+j]=dataSetMatrix[r*columns+j];
//printf ("Los indices son %d\n", r*columns+j);
}
}
}
int main( int argc, char *argv[] ) {
/**/
if( !omp_get_cancellation() )
{
//printf("Cancellations were not enabled, enabling cancellation and rerunning program\n");
putenv("OMP_CANCELLATION=true");
execv(argv[0], argv);
}
int numHilos = 0;
#pragma omp parallel
{
#pragma omp master
numHilos = omp_get_num_threads();
}
if (numHilos == 1) {
//printf("Program is executing sequentially, setting 2 threads and rerunning program\n");
putenv("OMP_NUM_THREADS=2");
execv(argv[0], argv);
}
/**/
float *cluster_centroid; // initial cluster centroids. The size is Clusters x rows
int *clustering_output; // output
int rows=0, columns=0, clusters=1;
int iterations = 1000;
float * dataSetMatrix=NULL;
char c, *fileName=NULL;
//int err=system("clear");
while ((c = getopt (argc, argv, "v:c:f:i:h")) != -1) {
switch (c) {
case 'v':
printf("K means algorithm v.1.0\n\n");
return 0;
case 'c':
clusters = atoi(optarg);
if (clusters < 1) {
printf ("the minimum number of clusters is 1\n");
return 0;
}
break;
case 'f':
fileName = (char *) malloc (strlen(optarg)+1);
strcpy(fileName,optarg);
break;
case 'i':
iterations = atoi (optarg);
break;
case 'h':
case '?':
printf("Usage:\trun -c number of clusters -f fichero.txt -i number of iterations [-h | -? HELP] \n");
printf("\t<Params>\n");
printf("\t\t-v\t\tOutput version information and exit\n");
return 0;
}
}
//printf ("..............Loading data set...............\n ");
// Get file size dataset
getSizeFile( fileName, &rows, &columns );
clustering_output = (int *) malloc (rows*sizeof(int));
// Reserve dynamic memory for dataset matrix
reserveDynamicMemoryForMatrix( &dataSetMatrix, rows, columns );
// Set data in the dataset matrix
setDataInMatrix( dataSetMatrix, fileName, rows, columns );
//printf ("-------DataSet: \n");
//printMatrix(dataSetMatrix, rows, columns);
// printf ("..............Done..............\n ");
cluster_centroid = (float *) malloc (clusters*columns*sizeof(float));
random_init_centroid (cluster_centroid, dataSetMatrix, clusters, rows, columns);
//printf (".........Initial Prototypes: ................ \n");
//printMatrix(cluster_centroid, clusters, columns);
// COMENTAR ESTAS LÍNEA PARA NO MOSTRAR RESULTADOS
printf ("The number of instance: %d Variables: %d Clusters: %d and Iterations: %d\n", rows, columns,clusters, iterations);
// printf ("File: %d; \tClusters: %d; \tIterations: %d\n", filename, clusters, iterations);
//
double ini = omp_get_wtime();
kmeans (columns, dataSetMatrix, rows, clusters, cluster_centroid, iterations, clustering_output);
double fin = omp_get_wtime();
printf ("The execution time is %lf seconds\n", fin-ini);
// Free memory
free (dataSetMatrix);
free (cluster_centroid);
free (clustering_output);
}
谁知道为什么会出现这种瓶颈?我该怎么解决呢?
谢谢。
答案 0 :(得分:1)
1°/我没有看到很多矢量化的机会,因此simd
构造可能不会提高第一个循环的性能。可能在第二个。
2°/循环包含对共享变量cluster_member_count
和new_cluster_centroid
的写操作,应使用omp atomic
或omp critical
编译指示来保护这些操作免受竞争条件的影响。这将导致大多数循环的序列化。您需要确保两个线程不能处理属于同一群集的点。
有几种方法可以解决此问题。一种是先获取群集号,然后决定是否将其处理。
// make sure the code will compile even if openMP is disabled
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
#pragma omp parallel
{
// declare local variables
int active_cluster;
int myThread = omp_get_thread_num();
int nbOfThreads = omp_get_num_threads();
for (int i = 0; i < n; ++i) {
active_cluster = cluster_assignment_index[i];
if (active_cluster%nbOfThreads == myThread){
// update count of members in that cluster
++cluster_member_count[active_cluster];
// sum point coordinates for finding centroid
#pragma omp simd
for (int j = 0; j < dim; ++j)
new_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
}// end if
} //end parallel
有两个问题:
如果某些群集明显大于其他群集,则可能导致线程不平衡。可以通过在第一遍计算群集成员数,然后决定做一些负载平衡来解决。
另外,即使每个线程现在仅在没有其他人可以写入的位置写入数据,但该数据可能仍属于其他线程使用的缓存行,从而导致false sharing。例如,为了递增cluster_member_count[2]
,线程需要从内存中获取它,因为cluster_member_count 1刚被另一个线程修改。这很慢。
一种更好的方法是安排每个线程仅将数据写入连续的内存空间。
#pragma omp parallel
{
// declare local variables
int active_cluster;
int myThread = omp_get_thread_num();
int nbOfThreads = omp_get_num_threads();
int process_from=myThread*k/nbOfThreads;
int process_to = (myThread+1==nbOfThreads) ? k : (myThread+1)*k/nbOfThreads;
for (int i = 0; i < n; ++i) {
active_cluster = cluster_assignment_index[i];
if (active_cluster>=process_from && active_cluster<process_to ){
//SAME//
}
}
最后,虽然使用了更多的内存,但是“ map-reduce”方法很方便。它消耗了更多的内存d / t本地缓冲区,但显示了如何必须对算法进行调整才能并行化:
// add to header
#ifdef _OPENMP
#include <omp.h>
#else
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
void calc_cluster_centroids(int dim, int n, int k, float *const X, int *const cluster_assignment_index, float *restrict new_cluster_centroid) {
int ** cluster_member_count;
float ** local_cluster_centroid;
float normFactor;
#pragma omp parallel
{
//declare variables
int active_cluster;
int myThread = omp_get_thread_num();
int nbOfThreads = omp_get_num_threads();
int number_count;
float normFactor;
// allocate memory
// this is done now as we need to be in the parallel region to know how many threads there are
// one thread will allocate the arrays of local buffers, the each thread allocates its own local buffers
#pragma omp single
{
cluster_member_count=malloc(nbOfThreads*sizeof(*cluster_member_count));
local_cluster_centroid=malloc(nbOfThreads*sizeof(*local_cluster_centroid));
}
// allocate local buffer for each thread
// for thread 0, local_cluster_centroid[0] contains new_cluster_centroid
cluster_member_count[myThread] = calloc(k,sizeof(int));
if (myThread) local_cluster_centroid[myThread] = calloc(k*dim,sizeof(float));
else local_cluster_centroid[myThread] = new_cluster_centroid;
//MAP : loop over points, increment count and accumumlate position in a local buffer
int *my_member_count=cluster_member_count[myThread];
float *my_cluster_centroid=local_cluster_centroid[myThread];
#pragma omp for
for (int i = 0; i < n; ++i) {
// each thread writes in its own buffer so there is no race condition or cache false sharing
active_cluster = cluster_assignment_index[i];
// update count of members in that cluster
++my_member_count[active_cluster];
// sum point coordinates for finding centroid
for (int j = 0; j < dim; ++j)
my_cluster_centroid[active_cluster*dim + j] += X[i*dim + j];
}
// REDUCE : loop over clusters and summ all local buffers
#pragma omp for schedule(static,64)
for (int i = 0; i < k; ++i) {
number_count=cluster_member_count[0][i];
for (int buff=1 ; buff<nbOfThreads ; ++buff){
number_count+=cluster_member_count[buff][i];
// sum point coordinates for finding centroid
for (int j = 0; j < dim; ++j)
local_cluster_centroid[0][i*dim + j]+=local_cluster_centroid[buff][i*dim + j];
}
normFactor=(number_count) ? 1/(float) number_count : 1.0f ;
#pragma omp simd
for (int j = i*dim; j<(i+1)*dim ; ++j)
cluster_member_count[0][j]*=normFactor;
}
// free memory
free(cluster_member_count[myThread]);
if (myThread) free(local_cluster_centroid[myThread]);
}//end parallel
//once each thread has freeed its own buffer, let the master free the array of buffers
free(cluster_member_count);
free(local_cluster_centroid);
}//end function