需要帮助多线程一个超级简单但超级优雅的练习曲!
如下所示,评论的9行是通用的最长的Common SubString 循环实现,而下面的片段是无分支的SSE2对应物。练习曲工作得很好,但是当试图多线程时(尝试了几种方式) - IT报告随机纠正或不正确的结果?!
#ifdef KamXMM
printf("Branchless 128bit Assembly struggling ...\n");
for(i=0; i < size_inLINESIXFOUR2; i++){
XMMclone = _mm_set1_epi8(workK2[i]);
//omp_set_num_threads(4);
#ifdef Commence_OpenMP
//#pragma omp parallel for shared(workK,PADDED32,Matrix_vectorCurr,Matrix_vectorPrev) private(j,ThreadID) // Sometimes reports correctly sometimes NOT?!
#endif
for(j=0; j < PADDED32; j+=(32/2)){
XMMprev = _mm_loadu_si128((__m128i*)(Matrix_vectorPrev+(j-1)));
XMMcurr = _mm_loadu_si128((__m128i*)&workK[j]);
XMMcmp = _mm_cmpeq_epi8(XMMcurr, XMMclone);
XMMand = _mm_and_si128(XMMprev, XMMcmp);
XMMsub = _mm_sub_epi8(XMMzero, XMMcmp);
XMMadd = _mm_add_epi8(XMMand, XMMsub);
_mm_storeu_si128((__m128i*)(Matrix_vectorCurr+j), XMMadd);
// This doesn't work, sometimes reports 24 sometimes 23, (for Carlos vs Japan):
//ThreadID=omp_get_thread_num();
//if (ThreadID==0) XMMmax0 = _mm_max_epu8(XMMmax0, XMMadd);
//if (ThreadID==1) XMMmax1 = _mm_max_epu8(XMMmax1, XMMadd);
//if (ThreadID==2) XMMmax2 = _mm_max_epu8(XMMmax2, XMMadd);
//if (ThreadID==3) XMMmax3 = _mm_max_epu8(XMMmax3, XMMadd);
{
XMMmax = _mm_max_epu8(XMMmax, XMMadd);
}
// if(workK[j] == workK2[i]){
// if (i==0 || j==0)
// *(Matrix_vectorCurr+j) = 1;
// else
// *(Matrix_vectorCurr+j) = *(Matrix_vectorPrev+(j-1)) + 1;
// if(max < *(Matrix_vectorCurr+j)) max = *(Matrix_vectorCurr+j);
// }
// else
// *(Matrix_vectorCurr+j) = 0;
}
// XMMmax = _mm_max_epu8(XMMmax, XMMmax0);
// XMMmax = _mm_max_epu8(XMMmax, XMMmax1);
// XMMmax = _mm_max_epu8(XMMmax, XMMmax2);
// XMMmax = _mm_max_epu8(XMMmax, XMMmax3);
_mm_storeu_si128((__m128i*)vector, XMMmax); // No need since it was last, yet...
for(k=0; k < 32/2; k++)
if ( max < vector[k] ) max = vector[k];
if (max >= 255) {printf("\nWARNING! LCSS >= 255 found, cannot house it within BYTE long cell! Exit.\n"); exit(13);}
printf("%s; Done %d%% \r", Auberge[Melnitchka++], (int)(((double)i*100/size_inLINESIXFOUR2)));
Melnitchka = Melnitchka & 3; // 0 1 2 3: 00 01 10 11
Matrix_vectorSWAP=Matrix_vectorCurr;
Matrix_vectorCurr=Matrix_vectorPrev;
Matrix_vectorPrev=Matrix_vectorSWAP;
}
#endif
我的愿望是将它提升到内存带宽的范围,在我的笔记本电脑上使用i5-7200u它以5GB / s的速度遍历行,而memcpy()
则在12GB / s的某个位置。
我对OpenMP的理解是肤浅的,我设法使用多线程(带有#pragma omp sections nowait
)非向量代码,但是向量是有问题的,如何告诉编译器XMMmax
必须是私有的? !