OpenCL内核没有完成执行

时间:2014-04-11 17:43:31

标签: crash opencl gpu

我正在编写一个简单的蒙特卡罗程序来模拟电子散射。用1000万个电子运行内核时一切正常,但当我把电子数量增加到更大的值(比如5000万)时,内核就无法运行结束,计算机也会死机。我想知道这是硬件问题,还是代码中可能存在错误。我是在一台配备ATI Radeon HD 5870的iMac上运行这段代码的。

    int rand_r (unsigned int seed)
{
    /*
     * Stateless pseudo-random generator built from three rounds of the
     * linear congruential step  state = state * 1103515245 + 12345.
     *
     * seed    : generator state to start from.  It is passed by value, so
     *           the caller's variable is NOT advanced; to obtain a sequence
     *           the caller must feed the returned value back in as the next
     *           seed.
     * returns : a non-negative 31-bit value (0 .. 2147483647) assembled from
     *           11 + 10 + 10 bits taken from bits 16 and up of the three
     *           successive states.
     *
     * All arithmetic is done on unsigned int so the multiplications wrap
     * instead of overflowing a signed type.
     */
    unsigned int state = seed;
    unsigned int bits;

    /* Round 1 contributes the top 11 bits of the result. */
    state = state * 1103515245u + 12345u;
    bits = (state >> 16) & 0x7FFu;

    /* Rounds 2 and 3 each append 10 more bits below the previous ones. */
    state = state * 1103515245u + 12345u;
    bits = (bits << 10) | ((state >> 16) & 0x3FFu);

    state = state * 1103515245u + 12345u;
    bits = (bits << 10) | ((state >> 16) & 0x3FFu);

    /* bits < 2^31, so the conversion to int is value-preserving. */
    return (int) bits;
}

/*
 * Advance the per-work-item random state and return the draw scaled by
 * 2147483647, the largest value rand_r() can return, i.e. a float in [0, 1].
 * The raw rand_r() result becomes the new state, so successive calls walk a
 * chain of values.
 */
float mc_next_uniform(int *state)
{
    const float rng_max = 2147483647.0f;
    int raw = rand_r(*state);

    *state = raw;
    return raw / rng_max;
}

/*
 * Mean free path for the given energy and screening factor alpha, computed
 * as A / (rho * sig_eNA) with the cross-section expression used throughout
 * this kernel.  Units follow whatever the caller uses for energy, rho and A
 * (presumably keV, g/cm^3 and g/mol - TODO confirm).
 */
float mc_mean_free_path(float energy, float alpha, float z, float rho, float A, float pi)
{
    float sig_eNA = (5.21f * 602.3f)*((z*z)/(energy*energy))*((4.0f*pi)/(alpha*(1+alpha)))*((energy + 511.0f)*(energy + 511.0f)/((energy + 1024.0f)*(energy + 1024.0f)));

    return A/(rho*sig_eNA);
}

/*
 * Monte Carlo electron-scattering kernel.
 *
 * E     : starting energy given to every simulated electron.
 * bse   : output array; element count*ty + tx receives the number of this
 *         work-item's electrons whose z coordinate became <= 0 at least once.
 * count : row stride used to flatten the 2-D global id; the per-work-item
 *         electron budget is 10000000 / (count*count), so the launch is
 *         presumably a count x count grid - TODO confirm against the host.
 *
 * Each work-item simulates its electrons one after another, following every
 * electron for a fixed 1000 scattering steps.
 */
__kernel void MC(const float E, __global float* bse, const int count) {
    const int tx = get_global_id(0);
    const int ty = get_global_id(1);
    const int slot = count*ty + tx;

    const float PI = 3.14159f;

    /* Target material constants (these values match nickel - TODO confirm). */
    const float z = 28.0f;
    const float rho = 8.908f;
    const float A = 58.69f;
    const float J = (9.76f*z + 58.5f*powr(z,-0.19f))*1E-3f;

    const int electrons_total = 10000000;
    const int steps_per_electron = 1000;
    const int num = electrons_total/(count*count);

    /*
     * Smallest non-zero value mc_next_uniform() can return.  Clamping a draw
     * to it before taking the logarithm keeps the sampled step finite when
     * the generator yields exactly 0; every other draw is left untouched.
     */
    const float u_floor = 1.0f/2147483647.0f;

    /* Incident direction: tilt.x is 70 degrees in radians, tilt.y is 0. */
    const float2 tilt = (float2)((70.0f/180.0f)*PI , 0.0f);
    const float4 dir0 = (float4)(cos(tilt.y)*sin(tilt.x), sin(tilt.y)*sin(tilt.x), cos(tilt.x), 0.0f);

    /* Every electron starts at energy E, so its first mean free path is
       the same for all of them and is computed once. */
    const float alpha_coeff = (3.4E-3f)*powr(z,0.67f);
    const float mfp0 = mc_mean_free_path(E, alpha_coeff/E, z, rho, A, PI);

    int seed = slot;      /* each work-item seeds the generator with its flat index */
    int counter = 0;      /* electrons of this work-item that reached z <= 0 */

    for (int i = 0; i < num; ++i){
        float u = mc_next_uniform(&seed);

        /* First free flight from the origin along the incident direction. */
        float4 pos = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
        float4 dir = dir0;
        float energy = E;
        int escaped = 0;

        pos += (-mfp0 * log(fmax(u, u_floor))) * dir;

        /*
         * NOTE(review): the loop always runs all steps, even after the
         * electron has been counted or its energy has dropped to zero or
         * below.  Those extra iterations cannot change `counter`; breaking
         * out early would shorten the kernel considerably but would also
         * change the random sequence seen by the following electrons, so it
         * is deliberately left as is.
         */
        for (int s = 0; s < steps_per_electron; ++s){
            const float alpha = alpha_coeff/energy;
            const float mfp = mc_mean_free_path(energy, alpha, z, rho, A, PI);

            /* First draw: path length to the next scattering event. */
            u = mc_next_uniform(&seed);
            const float step = -mfp * log(fmax(u, u_floor));
            const float de_ds = -78500.0f*(z/(A*energy)) * log((1.66f*(energy + 0.85f*J))/J);

            /* Second draw: polar scattering angle. */
            u = mc_next_uniform(&seed);
            const float phi = acos(1 - ((2*alpha*u)/(1 + alpha - u)));

            /* Third draw: azimuthal angle. */
            u = mc_next_uniform(&seed);
            const float psi = 2*PI*u;

            const float sin_phi = sin(phi);
            const float cos_phi = cos(phi);
            const float sin_psi = sin(psi);
            const float cos_psi = cos(psi);
            float4 next;

            if ((dir.z >= 0.999f) || (dir.z <= -0.999f)){
                /* Nearly parallel to the z axis: the general rotation below
                   would divide by ~0, so only the sign of dir.z is kept.
                   fabs() is required here; abs() is the integer built-in. */
                next = (float4)(sin_phi * cos_psi, sin_phi * sin_psi, (dir.z/fabs(dir.z))*cos_phi, 0.0f);
            }
            else {
                const float dsq = sqrt(1-dir.z*dir.z);
                const float dsqi = 1/dsq;
                next = (float4)(sin_phi*(dir.x*dir.z*cos_psi - dir.y*sin_psi)*dsqi + dir.x*cos_phi,
                                sin_phi * (dir.y * dir.z * cos_psi + dir.x * sin_psi) * dsqi + dir.y * cos_phi,
                                -sin_phi * cos_psi * dsq + dir.z * cos_phi,
                                0.0f);
            }

            pos += step * next;
            dir = next;
            energy += step*rho*de_ds;

            /* Count each electron at most once, the first time z <= 0. */
            if (pos.z <= 0 && !escaped){
                counter++;
                escaped = 1;
            }
        }
    }

    bse[slot] = counter;
}

0 个答案:

没有答案