最近,我将OpenMP引入到我们小组的项目代码中。main函数主要运行两个for循环:外层循环控制“运行”(run),内层循环控制“代”(generation)。不同运行之间的各代完全相互独立,但同一次运行内的各代彼此依赖。
这个想法是并行化外部循环,即“运行”循环,同时让每个线程在分配给它的任何特定运行编号上保持世代的进化。
当设置OMP_THREADS = 1
时,即让程序只用一个线程运行时,它会顺利运行。如果此数字更高,我会收到以下错误:
Unhandled exception at 0x00F5C4C3 in projectc.exe: 0xC0000005: Access violation writing location 0x00000072.
以下内容出现在Visual Studio的“Autos”部分中:
(注意:t
,t->active_cells
和t->cellx
为“错误红色”,其余为白色时出现此错误)
如果我将外部循环上方 #pragma 中的 default(none) 更改为 default(shared),并将 t、s 和 bn(这些是在外部文件中初始化的结构)从 threadprivate 中移除,那么程序会在每个线程上正常运行一代,然后冻结(尽管CPU活动显示两个线程仍以与之前相同的强度运行)。
我无法弄清楚出了什么问题。仅在外部循环之外使用一个简单的 #pragma omp parallel for 当然不起作用,但我也尝试过将整个 main 声明为 #pragma omp parallel,并将外部循环声明为 #pragma omp for。我还尝试了其他一些细微的改动,这使我得出结论:问题一定与线程之间共享变量的方式有关……由于所有运行(因而所有线程)都是相互独立的,所有变量其实都可以设置为私有;尽管您会在 shared(..) 中看到一些重叠。
代码附于下方。
/* General Includes */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <omp.h>
/* Project Includes */
#include "main.h"
#include "randgen.h"
#include "board7.h"
#include "tissue.h"
#include "io.h"
/* Bit helpers: each expands to an expression yielding `arg` with bit number
 * `posn` (zero-based, counted from the least significant bit) changed.
 * Neither macro assigns to `arg`; the caller must store the result. */
#define BitFlp(arg,posn) ((arg) ^ (1L << (posn)))  /* toggle bit `posn` */
#define BitClr(arg,posn) ((arg) & ~(1L << (posn))) /* clear bit `posn` */
#define display_dbg 1 //Controls whether print statements in main.c are displayed.
#define display_time 1 //Controls whether timing print statements are executed.
/* No trailing semicolon here: a semicolon inside the macro body would be pasted
 * into every use and break expressions such as `ns / BILLION + sec`. */
#define BILLION 1000000000L
#define num_runs 10 //Controls number of runs per simulation
#define num_gens 4000//Controls number of generations per run
#define OMP_THREADS 1 // Max number of threads used if OpenMP is enabled
/*
 * File-scope working state used by main().
 *
 * Every variable named in the threadprivate directive at the end of this
 * block gets one copy per OpenMP thread. `i` is the only variable declared
 * here that is not in that list; main() uses it as the counter of the
 * parallelised run loop.
 */
int n;
int i;           /* run counter of the outer loop in main() */
int r;
int j;           /* generation counter of the inner loop in main() */
int z;           /* value read from the "current" state file; first run index */
int x;           /* value read from the "recover" state file */
int sxa;         /* value read from the "recovermode" file at start-up */
int y;           /* value read from the "gen" file; resume generation when x == 1 */
int flagb;       /* set to 1 once a run's fit_max has exceeded 99.0 */
int m;
int j1;          /* index (0 or 1) of the t.tgen slot evaluated this generation */
int j2;          /* the other t.tgen slot; receives the tournament output */
char a;
int max_fit_gen; /* generation at which max_fitness was recorded */
int collect_data;
int lb_run;
int w;           /* first generation of the current run */
int rn;
int sx;          /* value returned by readcurrent(recMode) each generation */
float f;
float max_fitness; /* largest fit_max seen so far in the current run */
tissuen *fx;
input_vec dx;
calookup ra;     /* passed by address to evaluatepopulation_tissueb() */

/* run_data, t, s and bn are not declared in this file section; presumably they
 * come from one of the project headers -- TODO confirm that every translation
 * unit declaring them carries the same threadprivate directive. */
#pragma omp threadprivate(n, r, j, x, z, sxa, y, flagb, m, \
j1, j2, a, max_fit_gen, collect_data, lb_run, w, \
rn, sx, f, max_fitness, fx, dx, ra, run_data, t, s, bn)
int main(int argc, char *argv[])
{
int* p = 0x00000000; // pointer to NULL
char sa[256];
char ss[10];
long randn;
boardtable ba;
srand((unsigned)time(NULL));
init_mm();
randn = number_range(1, 100);
#ifdef OS_WINDOWS
// Timing parameters
LARGE_INTEGER clk_freq;
LARGE_INTEGER t1, t2, t3;
#endif
#ifdef OS_UNIX
struct timespec clk_freq, t1, t2, t3;
#endif
double avg_gen_time, avg_run_time, run_time, sim_time, est_run_time, est_sim_time;
// File System and IO Parameters
char cwd[FILENAME_MAX];
getcwd(&cwd, sizeof(cwd));
char curState[FILENAME_MAX];
char recState[FILENAME_MAX];
char recMode[FILENAME_MAX];
char curGen[FILENAME_MAX];
char curRun[FILENAME_MAX];
char genTmp[FILENAME_MAX];
strcpy(curState, cwd);
strcpy(recState, cwd);
strcpy(recMode, cwd);
strcpy(curGen, cwd);
strcpy(curRun, cwd);
strcpy(genTmp, cwd);
#ifdef OS_WINDOWS
strcat(curState, "\\current.txt");
strcat(recState, "\\recover.txt");
strcat(recMode, "\\recovermode.txt");
strcat(curGen, "\\gen.txt");
strcat(curRun, "\\run");
strcat(genTmp, "\\tmp\\gentmp");
#endif
#ifdef OS_UNIX
strcat(curState, "/current.txt");
strcat(recState, "/recover.txt");
strcat(recMode, "/recovermode.txt");
strcat(curGen, "/gen.txt");
strcat(curRun, "/run");
strcat(genTmp, "/tmp/gentmp");
#endif
//Read current EA run variables (i.e. current run number, generation, recover mode status)
z = readorcreate(curState);
x = readorcreate(recState);
sxa = readorcreate(recMode);
y = readorcreate(curGen);
//Initialize simulation parameters
s.count = 0;
s.x[0] = 0;
s.y[0] = 0;
s.addvec[0] = 0;
s.bestnum = 0;
s.countb = 0;
s.count = 0;
initialize_sim_param(&s, 0, 200);
collect_data = 0;
//Build a collection of experiment initial conditions
buildboardcollection7(&bn);
//Determine clock frequency.
#ifdef OS_WINDOWS
if (display_time) get_frequency(&clk_freq);
#endif
#ifdef OS_UNIX
if (display_time) get_frequency(CLOCK_REALTIME, &clk_freq);
#endif
//Start simulation timer
#ifdef OS_WINDOWS
if (display_time) read_clock(&t1);
#endif
#ifdef OS_UNIX
if (display_time) read_clock(CLOCK_REALTIME, &t1);
#endif
#pragma omp parallel for schedule(static) default(none) num_threads(OMP_THREADS) \
private(sa, ss, randn, ba, t2, t3, avg_gen_time, avg_run_time, sim_time, \
run_time, est_run_time, est_sim_time) \
shared(i, cwd, recMode, curRun, curGen, curState, genTmp, clk_freq, t1)
for (i = z; i < num_runs; i++)
{
// randomly initialize content of tissue population
initialize_tissue_pop_s2(&(t.tgen[0]), &s);
initialize_tissue_pop_s2(&(t.tgen[1]), &s);
max_fit_gen = 0;
max_fitness = 0.0;
flagb = 0;
if ((i == z) && (x == 1))
{
w = y;
}
else
{
w = 0;
}
rn = 200;
j1 = 0;
s.run_num = i;
s.maxfitness = 0.0;
//Start run timer
#ifdef OS_WINDOWS
if (display_time) read_clock(&t2);
#endif
#ifdef OS_UNIX
if (display_time) read_clock(CLOCK_REALTIME, &t2);
#endif
#if defined(_OPENMP)
printf("\n ======================================= \n");
printf(" OpenMP Status Message \n");
printf("\n --------------------------------------- \n");
printf("| RUN %d : \n", i);
printf("| New Thread Process (Thread %d) \n", omp_get_thread_num());
printf("| Available Threads: %d of %d \n", omp_get_num_threads(), omp_get_max_threads());
printf(" ======================================= \n\n");
#endif
for (j = w; j < num_gens; j++)
{
// Flips on lightboard data collection. See board7.h.
if (enable_collection == 1) {
if ((i >= run_collect) && (j >= gen_collect)) { collect_data = 1; }
}
sx = readcurrent(recMode);
// Pseudo loop code. Uses bit flipping to cycle through boards.
j2 = ~(j1)& 1;
if (display_dbg) printf("start evaluation...\n");
// evaluate tissue
// Most of the problems in the code happen here.
evaluatepopulation_tissueb(&(t.tgen[j1]), &ra, &bn, &s, j, i);
if (display_dbg) printf("\n");
// display fitness stats to screen
printmaxfitness(&(t.tgen[j1]), i, j, j1, &cwd);
if (display_dbg) printf("start tournament...\n");
// Perform tournament selection and have children ready for evaluation
// Rarely have to touch. Figure out best parents. Crossover operator.
// Create a subgroup. Randomly pick individuals from the population.
// Pick fittest individuals out of the random group.
// 2 parents and 2 children. Children replace parents.
tournamentsel_tissueb(&(t.tgen[j1]), &(t.tgen[j2]), &s);
printf("Tournament selection complete.\n");
// keep track of best fitness during run
if (t.tgen[j1].fit_max > max_fitness)
{
max_fitness = t.tgen[j1].fit_max;
max_fit_gen = j;
}
if ((t.tgen[j1].fit_max > 99.0) && (flagb == 0))
{
flagb = 1;
run_data.fit90[i] = t.tgen[j1].fit_max;
run_data.gen90[i] = j;
}
sa[0] = 0;
strcat(sa, curRun);
sprintf(ss, "%d", i);
strcat(sa, ss);
strcat(sa, ".txt");
printf("Write fitness epc...\n");
// write fitness stats to file
writefitnessepc(sa, &(t), j1, j);
printf("Write fitness complete.\n");
// trunk for saving population to disk
if (sx != 0)
{
sa[0] = 0;
strcat(sa, genTmp);
sprintf(ss, "%d", 1);
strcat(sa, ss);
strcat(sa, ".txt");
if (display_dbg) printf("Saving Current Run\n");
}
//update current generation to file
writecurrent(curGen, j + 1);
if (display_time && j > 0 && (j % 10 == 0 || j % (num_gens - 1) == 0))
{
#ifdef OS_WINDOWS
read_clock(&t3);
sim_time = (t3.QuadPart - t1.QuadPart) / clk_freq.QuadPart;
run_time = (t3.QuadPart - t2.QuadPart) / clk_freq.QuadPart;
#endif
#ifdef OS_UNIX
read_clock(CLOCK_REALTIME, &t3);
sim_time = (double)(t3.tv_sec - t1.tv_sec);
run_time = (double)(t3.tv_sec - t2.tv_sec);
#endif
avg_gen_time = run_time / (j + 1);
est_run_time = avg_gen_time * (num_gens - j);
avg_run_time = est_run_time + run_time;
est_sim_time = (est_run_time * (num_runs - i)) / (i + 1);
printf("\n============= Timing Data =============\n");
printf("Time in Simulation: %.2fs\n", sim_time);
printf("Time in Run: %.2fs\n", run_time);
printf("Est. Time to Complete Run: %.2fs\n", est_run_time);
printf("Est. Time to Complete Simulation: %.2fs\n\n", est_sim_time);
printf("Average Time Per Generation: %.2fs/gen\n", avg_gen_time);
printf("Average Time Per Run: %.2fs/run\n", avg_run_time);
printf("=======================================\n\n");
if (j % (num_gens - 1) == 0) {
}
}
//Display Position Board
//displayboardl(&bn.board[0]);
j1 = j2;
}
}
}
答案 0 :(得分:3)
代码对于正确的测试而言太大了,使用全局变量确实无法弄清楚数据依赖性。不过我可以发表一些评论:
i 被声明为 shared,而它是并行化循环的索引。这是错的!如果在 omp for 循环中确实存在一个您真正想要 private 的变量,那么它就是循环索引。我在C和C++的OpenMP标准中没有发现任何明确的规定,而对于Fortran,循环索引(以及所有被包含的内层循环的索引)是隐式私有化的。尽管如此,英特尔编译器在尝试将这样的索引显式声明为 shared 时会报错:
sharedi.cc(11): warning #2555: static control variable for parallel loop
for ( i=0; i<10; i++ ) {
^
sharedi.cc(10): error: index variable "i" of for statement following an OpenMP for pragma must be private
#pragma omp parallel for shared(i) schedule(static)
^
compilation aborted for sharedi.cc (code 2)
与此同时,gcc 5.1.0版本对相同的代码不会发出任何警告或错误,并且其行为就像该变量已被声明为 private 一样……我倾向于认为英特尔编译器的行为更合理,但我不能100%确定哪一个是正确的。但我所知道的是,将 i 声明为 shared 绝对是一个非常糟糕的主意(在我看来甚至是一个错误)。所以我觉得这是一个灰色区域,您的编译器可能会、也可能不会做出合理的处理,这本身就可以解释您的大部分问题。
您似乎将数据输出到文件中,这些名称可能会跨线程冲突。小心一点,因为你最终会弄得一团糟......
您的打印输出很可能会全部交错混乱。我不知道您对此有多在意,但按现在的写法,输出不会是整齐的。
总而言之,您的代码对我来说太过纠结,无法看清到底发生了什么。请至少先解决我提到的前两点,这也许就足以让它“工作”起来。不过,我还是强烈建议您清理代码并摆脱全局变量。同样,尽量在源代码中尽可能晚地声明变量,因为这样可以减少为OpenMP将它们声明为 private 的需要,并且可以大大提高可读性。
祝你好好调试。