我使用 pthreads 编写了一个简单的平流方程求解器,它在主机处理器上运行正常。但是当我使用 -mmic 标志编译并在协处理器上运行时(通过 micnativeloadex),它只使用了一个线程(我在代码中把线程数硬编码为 200)。据我所知,这段代码应该无需修改就能运行。我是不是遗漏了什么?
代码比较乱,但它是完整的。
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>
// Returns the current wall-clock time, as reported by gettimeofday(), in
// microseconds since the Epoch.
//
// The seconds field is widened to int64_t before scaling so the
// multiplication cannot overflow on platforms where time_t is 32 bits wide.
// Returns 0 if gettimeofday() reports failure, so the timeval is never read
// uninitialised.
int64_t TimeInMicros() {
    struct timeval tv;
    if (gettimeofday(&tv, NULL) != 0) {
        return 0;
    }
    return (int64_t)tv.tv_sec * 1000000 + (int64_t)tv.tv_usec;
}
// Problem description and work arrays shared by every solver thread.
// Member order is kept as-is so the layout of the struct is unchanged.
struct Grid {
    int nx;         // number of cells in the grid

    double *u;      // cell values; advanced in place by solver()
    double *u_new;  // second solution pointer; solver() does not use it
    double *f;      // flux array; solver() stores a*u[i-1] in f[i]
    double *res;    // per-cell residual, -(f[i+1] - f[i]) / dx

    double a;       // factor multiplying u when the flux is formed
    double cfl;     // solver() derives its time step as cfl*dx/a
    double dx;      // divisor of the flux difference in the residual

    double tf;      // solver() keeps stepping while its time is below tf
};
// Per-thread argument bundle; main() passes one of these to each solver()
// thread through pthread_create().
struct ThreadData{
// Index of this thread; solver() multiplies it by the chunk size to find
// the first cell of the thread's slice.
int tid;
// Grid shared by all threads (every thread receives the same pointer).
struct Grid *grid;
// Total number of threads; solver() divides grid->nx by this value to get
// the number of cells per thread.
int maxthreads;
// Barrier that solver() waits on twice per time step.
pthread_barrier_t *barr;
};
// Waits on the barrier and terminates the whole process if the wait fails.
// PTHREAD_BARRIER_SERIAL_THREAD is a normal return value for one of the
// waiting threads, so it is not treated as an error.
static void wait_at_barrier(pthread_barrier_t *barr){
    int rc = pthread_barrier_wait(barr);
    if(rc != 0 && rc != PTHREAD_BARRIER_SERIAL_THREAD){
        printf("Could not wait on barrier\n");
        exit(-1);
    }
}

// Thread entry point: advances this thread's slice of the grid in time.
//
// args must point to a struct ThreadData. The thread owns cells
// [start, start+chunk) with chunk = nx/maxthreads and start = tid*chunk.
// Each time step has two phases separated by barriers:
//   1. fill the fluxes f[start+1 .. start+chunk] from the thread's own cells;
//   2. after all fluxes are written, form the residual and update u in place.
// Phase 2 reads only f, which is complete after the first barrier, so
// updating u in place is safe. The second barrier keeps any thread from
// overwriting f for the next step while another is still reading it.
//
// Every thread computes dt and t from the same inputs, so all threads run
// the same number of steps and arrive at the same barriers.
//
// Always returns NULL. A failed barrier wait ends the process with exit(-1).
void *solver(void *args){
    struct ThreadData *td = (struct ThreadData *)args;
    struct Grid *grid = td->grid;
    pthread_barrier_t *barr = td->barr;

    double *u = grid->u;
    double *f = grid->f;
    double *res = grid->res;
    const double a = grid->a;
    const double dx = grid->dx;
    const double tf = grid->tf;
    const int nx = grid->nx;
    const double dt = grid->cfl*dx/a;

    const int chunk = nx/(td->maxthreads);
    const int start = td->tid*chunk;

    double t = 0.0;
    while(t < tf){
        // Each thread writes only the fluxes to the right of its own cells.
        // Together the threads still fill f[1..nx] with f[i] = a*u[i-1], but
        // no element of f is written by two threads and no thread reads a
        // neighbour's u during this phase.
        f[start+1:chunk] = a*u[start:chunk];
        wait_at_barrier(barr);

        if(start == 0){
            // Inflow flux for the first cell, written by the thread that owns it.
            // NOTE(review): for a periodic domain the wrap-around flux would be
            // f[nx] (= a*u[nx-1]); f[nx-1] is kept to preserve existing results.
            // Confirm the intended boundary condition.
            f[0] = f[nx-1];
        }
        res[start:chunk] = -(f[start+1:chunk] - f[start:chunk])/dx;
        u[start:chunk] += res[start:chunk]*dt;
        wait_at_barrier(barr);

        t += dt;
    }
    return NULL;
}
int main(int argc, char*argv[]){
int nx=100000;//atoi(argv[1]);
int nthreads=200;//atoi(argv[2]);
if(nx%nthreads != 0){
printf("ERROR: Number of cells should be integral multiple of number of threads \n");
exit(1);
}
pthread_t *threads = new pthread_t[nthreads]();
struct ThreadData td[nthreads];
pthread_barrier_t barr;
pthread_barrier_init(&barr, NULL, nthreads);
double *u = new double[nx]();
double *res = new double[nx]();
double *f = new double[nx+1]();
double dx = 1.0/nx;
double cfl = 0.9;
double a = 1.0;
double tf = 1.0;
int i;
// initialize
u[0:nx] = 0.0;
u[nx/4:nx/2] = 1.0;
f[0:nx+1] = 0.0;
res[0:nx] = 0.0;
struct Grid grid;
grid.nx = nx;
grid.a = a;
grid.cfl = cfl;
grid.dx = dx;
grid.u = u;
grid.u_new = u;
grid.res = res;
grid.f = f;
grid.tf = tf;
for(i=0;i<nthreads;i++){
td[i].tid = i;
td[i].grid = &grid;
td[i].maxthreads = nthreads;
td[i].barr = &barr;
}
int64_t t1 = TimeInMicros();
for(i=0;i<nthreads;i++){
pthread_create(&threads[i],NULL,solver,&(td[i]));
}
for(i=0;i<nthreads;i++){
pthread_join(threads[i],NULL);
}
int64_t t2 = TimeInMicros();
printf("Execution time: %.10f\n", (t2-t1)*1e-6);
FILE * outfile;
outfile = fopen("results.txt", "w+");
for(i = 0; i < nx; i++){
fprintf(outfile, "%.10f %.10f\n", i*dx, grid.u[i]);
}
fclose(outfile);
delete[] threads;
delete[] u;
delete[] res;
delete[] f;
}