我正在编写一个简单的蒙特卡罗代码来模拟电子散射。我运行核心1000万电子并运行良好,但当我将电子数量增加到更高的数字,比如5000万时,代码就不会完成而计算机会冻结。我想知道这是硬件问题还是代码中可能存在错误。我正在使用ATI Radeon HD 5870在iMac上运行代码。
int rand_r (unsigned int seed)
{
unsigned int next = seed;
int result;
next *= 1103515245;
next += 12345;
result = (unsigned int) (next / 65536) % 2048;
next *= 1103515245;
next += 12345;
result <<= 10;
result ^= (unsigned int) (next / 65536) % 1024;
next *= 1103515245;
next += 12345;
result <<= 10;
result ^= (unsigned int) (next / 65536) % 1024;
seed = next;
return result;
}
__kernel void MC(const float E, __global float* bse, const int count) {
int tx, ty;
tx = get_global_id(0);
ty = get_global_id(1);
float RAND_MAX = 2147483647.0f;
int rand_seed;
int seed = count*ty + tx;
float rand;
float PI;
PI = 3.14159f;
float z;
z = 28.0f;
float rho;
rho = 8.908f;
float A;
A = 58.69f;
int num;
num = 10000000/(count*count);
int counter, counter1, counter2;
counter = 0;
float4 c_new, r_new;
float E_new, alpha, de_ds, phi, psi, mfp,sig_eNA,step, dsq, dsqi, absc0z;
float J;
J = (9.76f*z + 58.5f*powr(z,-0.19f))*1E-3f;
float4 r0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
float2 tilt = (float2)((70.0f/180.0f)*PI , 0.0f);
float4 c0 = (float4)(cos(tilt.y)*sin(tilt.x), sin(tilt.y)*sin(tilt.x), cos(tilt.x), 0.0f);
for (int i = 0; i < num; ++i){
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //some random no. generator in gpu
r0 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
c0 = (float4)(cos(tilt.y)*sin(tilt.x), sin(tilt.y)*sin(tilt.x), cos(tilt.x), 0.0f);
E_new = E;
c_new = c0;
alpha = (3.4E-3f)*powr(z,0.67f)/E_new;
sig_eNA = (5.21f * 602.3f)*((z*z)/(E_new*E_new))*((4.0f*PI)/(alpha*(1+alpha)))*((E_new + 511.0f)*(E_new + 511.0f)/((E_new + 1024.0f)*(E_new + 1024.0f)));
mfp = A/(rho*sig_eNA);
step = -mfp * log(rand);
r_new = (float4)(r0.x + step*c_new.x, r0.y + step*c_new.y, r0.z + step*c_new.z, 0.0f);
r0 = r_new;
counter1 = 0;
counter2 = 0;
while (counter1 < 1000){
alpha = (3.4E-3f)*powr(z,0.67f)/E_new;
sig_eNA = (5.21f * 602.3f)*((z*z)/(E_new*E_new))*((4*PI)/(alpha*(1+alpha)))*((E_new + 511.0f)*(E_new + 511.0f)/((E_new + 1024.0f)*(E_new + 1024.0f)));
mfp = A/(rho*sig_eNA);
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //some random no. generator in gpu
step = -mfp * log(rand);
de_ds = -78500.0f*(z/(A*E_new)) * log((1.66f*(E_new + 0.85f*J))/J);
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //new random no.
phi = acos(1 - ((2*alpha*rand)/(1 + alpha - rand)));
rand_seed = rand_r(seed);
seed = rand_seed;
rand = rand_seed/RAND_MAX; //third random no.
psi = 2*PI*rand;
if ((c0.z >= 0.999f) || (c0.z <= -0.999f) ){
absc0z = abs(c0.z);
c_new = (float4)(sin(phi) * cos(psi), sin(phi) * sin(psi), (c0.z/absc0z)*cos(phi), 0.0f);
}
else {
dsq = sqrt(1-c0.z*c0.z);
dsqi = 1/dsq;
c_new = (float4)(sin(phi)*(c0.x*c0.z*cos(psi) - c0.y*sin(psi))*dsqi + c0.x*cos(phi), sin(phi) * (c0.y * c0.z * cos(psi) + c0.x * sin(psi)) * dsqi + c0.y * cos(phi), -sin(phi) * cos(psi) * dsq + c0.z * cos(phi), 0.0f);
}
r_new = (float4)(r0.x + step*c_new.x, r0.y + step*c_new.y, r0.z + step*c_new.z, 0.0f);
r0 = r_new;
c0 = c_new;
E_new += step*rho*de_ds;
if (r0.z <= 0 && counter2 == 0){
counter++ ;
counter2 = 1;
}
counter1++ ;
}
}
bse[count*ty + tx] = counter;
}