所以我的线程代码是:
DWORD WINAPI ThreadFunc1(LPVOID lpParam )
{
THREAD_DATA *ptrDat = (THREAD_DATA *)(lpParam);
int loc_N = ptrDat->loc_N ;
int ntimes = ptrDat->ntimes;
__m128d rx0, ry0, result0;
for( int ip= 0; ip < ntimes; ip++ ) {
result0 = _mm_setzero_pd();
if (loc_N%2 != 0){
rx0 = _mm_load_sd(ptrDat->X);
ry0 = _mm_load_sd(ptrDat->Y);
ry0 = _mm_mul_pd(rx0, ry0);
result0 = _mm_add_pd(result0, ry0);
}
for( int i = loc_N%2; i < loc_N; i+=2 ) {
rx0 = _mm_load_pd(ptrDat->X+i);
ry0 = _mm_load_pd(ptrDat->Y+i);
ry0 = _mm_mul_pd(rx0, ry0);
result0 = _mm_add_pd(result0, ry0);
}
_mm_storeh_pd ( &ptrDat->res, _mm_add_pd(result0, _mm_shuffle_pd( result0, result0,1 )));
}
ptrDat->ret = 0;
return 0;
}
有片段主要功能:
loc_N = N/np;
N = loc_N*np;
try
{
X = new double[N];
Y = new double[N];
}
catch(bad_alloc aa)
{
cout << "memory allocation error\n";
system("pause");
exit(1);
}
//preparation of X, Y
int i;
for(i=0; i<N; i++)
{
X[i] = (double)(i+1);
Y[i] = 1.0;
}
for(ip=0; ip<np; ip++)
{
tDat[ip].loc_N = loc_N;
tDat[ip].N = N;
tDat[ip].ntimes = ntimes;
tDat[ip].X = X + ip*loc_N;
tDat[ip].Y = Y + ip*loc_N;
tDat[ip].threadNo = ip;
hThread[ip] = CreateThread(
NULL,
0,
ThreadFunc1,
(void*)&tDat[ip],
0,
NULL
);
if( !hThread[ip] ) {
exit(1);
}
}
loc_N是每个线程的元素数。 N个大小的向量x和y。 ntimes - 是一些重复算法。 threadNo - 线程号。
我不知道为什么当我有三个线程时程序崩溃在ry0 = _mm_load_pd(ptrDat-&gt; y + i);