OpenMP的执行时间是否取决于迭代次数或结构块的大小?我想使用Pocket-shpinx优化函数,但在程序完成后,可以看到它耗费了更多的时间。任何人都可以解释一下我的错在哪里
提前致谢。
我只附加了一小部分代码
static int32
get_scores_4b_feat_4(s2_semi_mgau_t * s, int i,
int16 *senone_scores, uint8 *senone_active,
int32 n_senone_active)
{
int32 j, xyz;
uint8 *pid_cw0, *pid_cw1, *pid_cw2, *pid_cw3;
uint8 w_den[4][16];
/* Precompute scaled densities. */
for (j = 0; j < 16; ++j) {
w_den[0][j] = s->mixw_cb[j] + s->f[i][0].score;
w_den[1][j] = s->mixw_cb[j] + s->f[i][1].score;
w_den[2][j] = s->mixw_cb[j] + s->f[i][2].score;
w_den[3][j] = s->mixw_cb[j] + s->f[i][3].score;
}
pid_cw0 = s->mixw[i][s->f[i][0].codeword];
pid_cw1 = s->mixw[i][s->f[i][1].codeword];
pid_cw2 = s->mixw[i][s->f[i][2].codeword];
pid_cw3 = s->mixw[i][s->f[i][3].codeword];
#pragma omp parallel for\
shared(s,w_den,senone_active,senone_scores)
for ( j = 0; j < n_senone_active; j++) {
int n = 0;
#pragma omp parallel for\
reduction(+:n)\
firstprivate(senone_active)
for(xyz = 0;xyz<=j;xyz++)
n = senone_active[xyz] + n ;
int tmp, cw;
if (n & 1) {
cw = pid_cw0[n/2] >> 4;
tmp = w_den[0][cw];
cw = pid_cw1[n/2] >> 4;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[1][cw]);
cw = pid_cw2[n/2] >> 4;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[2][cw]);
cw = pid_cw3[n/2] >> 4;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[3][cw]);
}
else {
cw = pid_cw0[n/2] & 0x0f;
tmp = w_den[0][cw];
cw = pid_cw1[n/2] & 0x0f;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[1][cw]);
cw = pid_cw2[n/2] & 0x0f;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[2][cw]);
cw = pid_cw3[n/2] & 0x0f;
tmp = fast_logmath_add(s->lmath_8b, tmp, w_den[3][cw]);
}
senone_scores[n] += tmp;
}
return 0;
}