How do I declare and malloc pointers inside an OpenMP parallel region? (Error: Segment violation ('core' generated))

Asked: 2018-11-21 12:27:07

Tags: c openmp

I am doing this:

void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

    int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

    #pragma omp parallel 
    {
        int ** localClusterMemberCount;
        int * activeCluster;
        #pragma omp single
        {
            localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
            //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
            for (int i = 0; i < omp_get_num_threads(); ++i) {
                localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
                //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
            }
            activeCluster = (int *) calloc (omp_get_num_threads(),sizeof(int));
        }

        // sum all points
        // for every point
        for (int i = 0; i < numObjs; ++i) {
            // which cluster is it in?
            activeCluster[omp_get_thread_num()] = clusterAssignmentCurrent[i];
            // update count of members in that cluster
            ++localClusterMemberCount[omp_get_thread_num()][activeCluster[omp_get_thread_num()]];
            // sum point coordinates for finding centroid
            for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
                clustersCentroID[activeCluster[omp_get_thread_num()]*numCoords + j] += dataSetMatrix[i*numCoords + j];
        }

        // now divide each coordinate sum by number of members to find mean/centroid
        // for each cluster
        for (int i = 0; i < numClusters; ++i) {
            if (localClusterMemberCount[omp_get_thread_num()][i] != 0)
                // for each coordinate
                for (int j = 0; j < numCoords; ++j)
#pragma omp atomic
                    clustersCentroID[i*numCoords + j] /= localClusterMemberCount[omp_get_thread_num()][i];  /// XXXX will divide by zero here for any empty clusters!
        }

        // free memory
        #pragma omp single
        {
            free (localClusterMemberCount[0]);
            free (localClusterMemberCount);
            free (activeCluster);
        }
    }
    free(clusterMemberCount);
}

But I get the error: Segment violation ('core' generated), so I am doing something wrong. The sequential version of the code works fine, so I think the error is in the pointer allocation. I also tried the parallel code without mallocs (using global variables together with atomic) and that works fine too. The error only appears when I try to create private pointers and malloc them.

Any idea how I can fix it?

1 answer:

Answer 0 (score: 2)

There are two reasons for the segfault:

  • localClusterMemberCount should be a shared variable, declared outside the parallel region and initialized inside it by a single thread. Otherwise, each thread has its own copy of the variable, and in every thread except the one that went through the single section it points to a random location in memory.
  • An implicit or explicit barrier is needed before the section of code that frees the pointers. All threads must have finished their work before the memory can be freed; otherwise one thread may free pointers that are still in use by other threads (see the minimal sketch right after this list).
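
To isolate those two fixes, here is a minimal sketch of the pattern (the names and the buffer size are illustrative, not taken from the code above): the pointer is declared before the parallel region, allocated by one thread, and only freed after a barrier.

#include <stdlib.h>
#include <omp.h>

void example(void) {
    int **buf;                           // shared: declared before the parallel region
    #pragma omp parallel shared(buf)
    {
        #pragma omp single
        {
            buf = malloc(omp_get_num_threads() * sizeof(int *));
            for (int t = 0; t < omp_get_num_threads(); ++t)
                buf[t] = calloc(100, sizeof(int));   // 100 is an arbitrary size
        }   // implicit barrier at the end of 'single': buf is valid for all threads from here on

        buf[omp_get_thread_num()][0] = 42;   // each thread touches only its own slice

        #pragma omp barrier                  // all threads must be done before freeing
        #pragma omp single
        {
            for (int t = 0; t < omp_get_num_threads(); ++t)
                free(buf[t]);
            free(buf);
        }
    }
}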

The code had a few other issues. See below, with my own comments marked with ***:

void calculateClusterCentroIDs(int numCoords, int numObjs, int numClusters, float * dataSetMatrix, int * clusterAssignmentCurrent, float *clustersCentroID) {

    int * clusterMemberCount = (int *) calloc (numClusters,sizeof(int));

    /* ***
     * This has to be a shared variable that each thread can access
     * If declared inside the parallel region, it will be a thread-local variable
     * which is left un-initialized for all but one thread. Further attempts to access
     * that variable will lead to segfaults
     */
    int ** localClusterMemberCount;
    #pragma omp parallel shared(localClusterMemberCount,clusterMemberCount)
    {

        // *** Make activeCluster a thread-local variable rather than a shared array (shared array will result in false sharing)
        int activeCluster;
        #pragma omp single
        {
            localClusterMemberCount = (int **) malloc (omp_get_num_threads() * sizeof(int *));
            //localClusterMemberCount[0] = (int *) calloc (omp_get_num_threads()*numClusters,sizeof(int));
            for (int i = 0; i < omp_get_num_threads(); ++i) {
                localClusterMemberCount[i] = calloc (numClusters,sizeof(int));
                //localClusterMemberCount[i] = localClusterMemberCount[i-1] + numClusters;
            }
        }

        // sum all points
        // for every point
        // *** Distribute the iterations among the threads; without a worksharing
        // *** construct, every thread would redundantly process all points
        #pragma omp for
        for (int i = 0; i < numObjs; ++i) {
            // which cluster is it in?
            activeCluster = clusterAssignmentCurrent[i];
            // update count of members in that cluster
            ++localClusterMemberCount[omp_get_thread_num()][activeCluster];
            // sum point coordinates for finding centroid

            // *** This may be slower in parallel because of the atomic operation
            for (int j = 0; j < numCoords; ++j)
                #pragma omp atomic
                clustersCentroID[activeCluster*numCoords + j] += dataSetMatrix[i*numCoords + j];
        }

        /* ***
         * Missing: one reduction step
         * The global cluster member count needs to be updated
         * one option is below:
         */
         #pragma omp critical
         for (int i = 0; i < numClusters; ++i) clusterMemberCount[i] += localClusterMemberCount[omp_get_thread_num()][i];
         #pragma omp barrier // wait here before moving on



        // *** The code below was wrong: to compute the average, the coordinate sums must be divided by the global count
        // *** Successive divisions by the local counts give a different result: e.g., (1/4)/6 is not the same as 1/(4+6)

        // now divide each coordinate sum by number of members to find mean/centroid
        // for each cluster
        #pragma omp for
        for (int i = 0; i < numClusters; ++i) {
            if (clusterMemberCount[i] != 0)
                // for each coordinate
                #pragma omp simd // not sure this will help, the compiler may already vectorize that
                for (int j = 0; j < numCoords; ++j)
                    clustersCentroID[i*numCoords + j] /= clusterMemberCount[i];
                    // *** no division by zero: empty clusters are skipped by the check above
                    // *** atomic is not needed; only one thread will access each value of clustersCentroID

        }

        #pragma omp barrier
        /* ***
         * A barrier is needed otherwise the first thread arriving there will start to free the memory
         * Other threads may still be in the previous loop attempting to access localClusterMemberCount
         * If the pointer has been freed already, this will result in a segfault
         *
         * With the corrected code, the implicit barrier at the end of the distributed
         * for loop would be sufficient. With your initial code, an explicit barrier 
         * would have been needed.
         */

        // free memory
        #pragma omp single
        {
            // *** Need to free all pointers and not only the first one
            for (int i = 0; i < omp_get_num_threads(); ++i) free (localClusterMemberCount[i]);
            free (localClusterMemberCount);
        }
    }
    free(clusterMemberCount);
}
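
As a side note: if your compiler supports OpenMP 4.5 array-section reductions, the per-thread buffers, the critical section, and the explicit barriers can be dropped entirely. A minimal sketch of the accumulation step under that assumption (clustersCentroID is assumed to be zeroed by the caller, as above):

#pragma omp parallel for reduction(+:clusterMemberCount[:numClusters]) \
                         reduction(+:clustersCentroID[:numClusters*numCoords])
for (int i = 0; i < numObjs; ++i) {
    int c = clusterAssignmentCurrent[i];   // cluster of point i
    ++clusterMemberCount[c];               // updates a private copy, merged after the loop
    for (int j = 0; j < numCoords; ++j)
        clustersCentroID[c*numCoords + j] += dataSetMatrix[i*numCoords + j];
}

The division by the member counts then follows in a separate loop, as in the corrected code above. Each thread gets private, zero-initialized copies of both arrays, so this trades the atomics and the hand-rolled reduction for extra memory (numClusters*numCoords floats per thread).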