Question

我目前正在使用MPI创建并行化代码，以找出任何给定图形中有多少个三角形。到目前为止，我的代码能够成功获得正确数量的三角形（我知道，因为我有一个相同代码的序列化版本，运行速度慢很多）直到某一点。我会说在大约6000个节点之后我的线程不再在函数内被读取（我通过查看主函数中的线程与计算线程是否使其成为函数来发现这一点）。

作为参考，代码本身包含许多节点，边，种子和度。

为了这个目的，我们将忽略种子和程度，因为当一切正常时它们完美地工作。

编辑：我将快速解释那些丢失的变量命名约定。基本上我们给出了一个多个节点的图形以及连接它们的边缘（想想邻接列表）。现在，程序的工作是遍历每个顶点u，并在图形中取另一个顶点v，以查找它们是否具有连接它们之间的相应顶点w。在这种情况下，由于C中没有bool，我们将使用int的边缘uv，uw和vw。这样，如果有连接边，那么我们可以将它们变为1.如果它们都是1，那么我们找到了一个三角形，现在可以将它添加到全局变量中。让我们知道，在这里找出三角形的for循环中的代码是100％正确的并且不是问题的问题。相反，问题涉及在更高节点处使用pthreads的问题。

以下是代码：

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#include "graph.h"

#define MAX_N       1000000
#define MAX_E       2*MAX_N // must be at least twice MAX_N

GRAPH_t * G;    
#define MAX_THREADS     65536
#include <pthread.h>


int thread_id[MAX_THREADS]; // User defined id for thread
pthread_t p_threads[MAX_THREADS];// Threads
pthread_attr_t attr;        // Thread attributes 

pthread_mutex_t lock_count; // Protects minimum, count

unsigned int parallelCount = 0;
unsigned int threadCount = 0;
void *count_ParallelTriangles(void *threadID) {

  int u = *((int *)threadID);
  int counter = 0;

  unsigned int v, w, e, uv, uw, vw;
  for (v = u + 1; v < G->n; v++) {
    uv = 0;
    for (e = G->V_ptr[v]; e < G->V_ptr[v + 1]; e++) {
        if (G->E_v[e] == u) {
            uv = 1;         // Edge (u,v) exists
        }
    }
    if (uv == 1) {
        for (w = v + 1; w < G->n; w++) {
            uw = 0; vw = 0;
            for (e = G->V_ptr[w]; e < G->V_ptr[w + 1]; e++) {
                if (G->E_v[e] == u) uw = 1;     // Edge (u,w) exists
                if (G->E_v[e] == v) vw = 1;     // Edge (v,w) exists
            }
            if ((uv == 1) && (vw == 1) && (uw == 1)) {
                counter += 1;
            }
        }
    }
  }
  //if (counter > 0) {
    pthread_mutex_lock(&lock_count);
    threadCount += 1;
    parallelCount += counter;
    pthread_mutex_unlock(&lock_count);
  //}

  pthread_exit(NULL);
}

及以下是在主

中调用的地方

int main(int argc, char *argv[]) {

struct timespec start, stop;
float time_serial;

unsigned int num_nodes, num_edges, seed, num_triangles, max_degree;

if (argc != 5) {
  printf("Use: <executable_name> <num_nodes> <num_edges> <seed> 
  <max_degree>\n"); 
  exit(0);
}
if ((num_nodes = atoi(argv[argc-4])) > MAX_N) {
  printf("Maximum number of nodes allowed: %u\n", MAX_N);
  exit(0);
}; 
if ((num_edges = atoi(argv[argc-3])) > MAX_E) {
  printf("Maximum number of edges allowed: %u\n", MAX_E);
  exit(0);
}; 
if (num_edges < 2*num_nodes) {
  num_edges = 2*num_nodes;
  printf("Number of edges must be at least twice the number of nodes: changing 
  num_edges to %u\n", num_edges);
  exit(0);
}; 
seed = atoi(argv[argc-2]);
max_degree = atoi(argv[argc-1]);

// Initialize graph
G = init_graph ( num_nodes, num_edges, seed, max_degree );

float time_parallel;

//Initialize Pthread
pthread_mutex_init(&lock_count, NULL);
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);

clock_gettime(CLOCK_REALTIME, &start);
for (int u = 0; u < num_nodes; u++) {
    thread_id[u] = u;
    pthread_create(&p_threads[u], &attr, count_ParallelTriangles, (void 
  *)&thread_id[u]);
  }
  for (int u = 0; u < num_nodes; u++) {
    pthread_join(p_threads[u], NULL);
  }

  clock_gettime(CLOCK_REALTIME, &stop);

  time_parallel = (stop.tv_sec - start.tv_sec)
    + 0.000000001*(stop.tv_nsec - start.tv_nsec);
  printf("Thread active: %d\n", threadCount);
  printf("Parallel execution time = %f s\n", time_parallel);
  // Print results
  printf("G: Nodes = %u, Edges = %u, Triangles = %u\n", G->n, G->m, parallelCount);

  pthread_attr_destroy(&attr);
  pthread_mutex_destroy(&lock_count);

  return 0;
}

最后，这是正确运行时的输出。边缘设置为最大值1000000，因此它计算它可以适应的平均度数的最大边数，在这种情况下是234110.

./triangles.exe 4096 1000000 0 0
Serial execution time = 16.181034 s
G: Nodes = 4096, Edges = 234110, Triangles = 651015
Thread active: 4096
Parallel execution time = 0.843587 s
G: Nodes = 4096, Edges = 234110, Triangles = 651015

我们可以看到上面的工作正常，因为线程数量等于声明的节点数量。但是，如果我们将节点增加几千，我们会看到输出不再正确运行，尽管仍然很快：

./triangles.exe 6000 1000000 0 0
Serial execution time = 48.326824 s
G: Nodes = 6000, Edges = 413845, Triangles = 1207058
Thread active: 2061
Parallel execution time = 1.471421 s
G: Nodes = 6000, Edges = 413845, Triangles = 1079834

在上面的例子中，如果我们再多次运行这个例子，那么线程的数量会在每次调用之间发生变化，而它计算的三角形也会随之改变（因为每个三角形的数量取决于正确传递它的线程）到全局变量）。序列化的计数和时间将保持相对一致，因为它已经是正确的。

编辑：为主文件添加了更多代码。

以下是用于创建图表的头文件

typedef struct _graph {
  unsigned int n;       // Number of vertices in the graph
  unsigned int m;       // Number of edges in the graph
  unsigned int * E_u;       // Edge i = (E_u[i],E_v[i]) 
  unsigned int * E_v;       // 
  unsigned int * V_ptr; // Edges incident on vertex u 
  // have ids V_ptr[u] ... V_ptr[u+1]-1
} GRAPH_t; 

GRAPH_t * init_graph ( unsigned int n, unsigned int m, unsigned int seed, 
unsigned int max_degree ) {
GRAPH_t * G = (GRAPH_t *) calloc(1, sizeof(GRAPH_t)); 
unsigned u, v, e, nbrs, first, lastplus1, maxvalue;
double fraction;
G->n = n;
G->E_u = (unsigned int *) calloc(m, sizeof(unsigned int)); 
G->E_v = (unsigned int *) calloc(m, sizeof(unsigned int)); 
G->V_ptr = (unsigned int *) calloc((G->n+1), sizeof(unsigned int)); 

srand48(seed); 
unsigned int count = 0; 
// Generate edges 
G->V_ptr[0] = count;

for (u = 1; u < G->n; u++) {

G->V_ptr[u] = count;

switch (max_degree) {
    case 0:         // max_degree = 0 => max_degree = sqrt(n)
    nbrs = sqrt(G->n); if (nbrs > u) nbrs = u; 
    break;
    default:
    nbrs = max_degree; if (nbrs > u) nbrs = u;
    break;
 }

 first = G->V_ptr[u]; 
 lastplus1 = first + nbrs; if (lastplus1 > m) lastplus1 = m;

 if (first < lastplus1) {

   for (e = first; e < lastplus1; e++) 
     G->E_v[e] = ((unsigned int) lrand48()) % G->n;

     maxvalue = G->E_v[first]; 
     for (e = first+1; e < lastplus1; e++) {
       G->E_v[e] += G->E_v[e-1];
       maxvalue = G->E_v[e];
     }

     for (e = first; e < lastplus1; e++) {
       fraction = ((double) G->E_v[e])/(maxvalue+1+(lrand48()%G->n));
       G->E_v[e] = (unsigned int) (fraction * u); 
     }

     // Generate edges incident at u 
     G->E_u[count] = u; 
     G->E_v[count] = G->E_v[count]; 
     count++;
     for (e = first+1; e < lastplus1; e++) {
      if (G->E_v[count-1] < G->E_v[e]) {
         G->E_u[count] = u; 
         G->E_v[count] = G->E_v[e]; 
         count++;
       }
     }
   }
 }
 G->V_ptr[n] = count;
 G->m = count-1;        // Initialize number of edges

 // Check graph
 for (u = 0; u < G->n; u++) {
   if (G->V_ptr[u] > G->V_ptr[u+1]) {
      printf("Graph generation problem - 1!!!\n"); 
      exit(0); 
   }
   for (e = G->V_ptr[u]; e < G->V_ptr[u+1]; e++) {
      if (G->E_u[e] != u) {
        printf("Graph generation problem - 2!!!\n"); 
        exit(0); 
      }
      if (G->E_v[e] >= u) {
        printf("Graph generation problem - 3!!!\n"); 
        exit(0); 
      }
      if ((e > G->V_ptr[u]) && (G->E_v[e] <= G->E_v[e-1])) {
        printf("Graph generation problem - 4!!!\n"); 
        exit(0); 
      }
    }
  }
  return G;
}

Answer 1

我意识到我的愚蠢，我忘记了超级计算机在命令行上执行程序的线程限制，如果在“专用模式”下使用批处理文件执行，它可以超过大约4096个线程。在使用批处理文件测试之后，我以我的方式意识到错误，这确实是我的解决方案。很抱歉给您带来不便！希望这些信息可以帮助其他用户检查超级计算机的多线程计算策略！感谢Giles让我检查错误代码，因为我没有意识到没有错误代码告诉我我在4096“跑出”线程（尽管有大约65,536哈哈）。一旦我能够，我将结束这个问题。

Pthreads：我的并行代码在一定量

1 个答案: