OpenCL AMD GPU编译器崩溃

时间:2014-08-19 10:03:05

标签: crash kernel opencl gpu amd-processor

我正在研究一个内核,它可以找到光线和三角形列表之间的交叉点,但是(总有一个"但是")我在使用opencl编译器时遇到了一些问题,当我尝试编译时它真的崩溃了它

我尝试在我的CPU编译器上编译它并且编译得很好,但是使用我的GPU编译器它会崩溃......

//-----------------------------------------------------------------------------  
//---------------------------------DEFINES-------------------------------------  
//-----------------------------------------------------------------------------  

#define RAYON_SORTANT -1000  
#define RAYON_ENTRANT 1000  

#define MIN_LONGUEUR_RT 1.E-6f  

//-----------------------------------------------------------------------------  
//---------------------------------CONTENT-------------------------------------  
//-----------------------------------------------------------------------------  

typedef struct s_CDPoint  
{  
    float x;  
    float y;  
    float z;  
} CDPoint;  

typedef struct s_TTriangle  
{  
    CDPoint            triangle_[3];  
    CDPoint            normal_;  
} TTriangle;  

typedef struct s_GridIntersection  
{  
    CDPoint         pos_;  
    float             distance_;  
    int                sensNormale_;  
    unsigned int    idTriangle_;  
} TGridIntersection;  

//-----------------------------------------------------------------------------  
//---------------------------------MUTEX---------------------------------------  
//-----------------------------------------------------------------------------  

void GetSemaphor(__global int * semaphor)  
{  
   int occupied = atomic_xchg(semaphor, 1);  
   while(occupied > 0)  
   {  
     occupied = atomic_xchg(semaphor, 1);  
   }  
}  

void ReleaseSemaphor(__global int * semaphor)  
{  
   int prevVal = atomic_xchg(semaphor, 0);  
}  

//-----------------------------------------------------------------------------  
//---------------------------------GEOMETRIE-----------------------------------  
//-----------------------------------------------------------------------------  

float dotProduct(const CDPoint* pA, const CDPoint* pB)  
{  
    return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);  
}  

CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)  
{  
    CDPoint res;  

    res.x = pA->y * pB->z - pB->y * pA->z;  
    res.y = pA->z * pB->x - pB->z * pA->x;  
    res.z = pA->x * pB->y - pB->x * pA->y;  

    return res;  
}  

CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)  
{  
    CDPoint res;  
    res.x = pA->x - pB->x;  
    res.y = pA->y - pB->y;  
    res.z = pA->z - pB->z;  
    return res;  
}  

CDPoint addition(const CDPoint* pA, const CDPoint* pB)  
{  
    CDPoint res;  
    res.x = pA->x + pB->x;  
    res.y = pA->y + pB->y;  
    res.z = pA->z + pB->z;  
    return res;  
}  

CDPoint homothetie(const CDPoint* pA, float val)  
{  
    CDPoint pnt;  
    pnt.x = pA->x * val;  
    pnt.y = pA->y * val;  
    pnt.z = pA->z * val;  
    return pnt;  
}  

//-----------------------------------------------------------------------------  
//---------------------------------KERNEL--------------------------------------  
//-----------------------------------------------------------------------------  

__kernel void IntersectionTriangle(    __global const TTriangle* pTriangleListe,  
                            const unsigned int idxDebutTriangle,  
                            const unsigned int idxFin,  
                            __constant const CDPoint* pPointOrigine,  
                            __constant const CDPoint* pDir,  
                            __global int *nbInter,  
                            __global TGridIntersection* pResults    )  
{  
    __private unsigned int index = get_global_id(0) + idxDebutTriangle;  

    if (index > idxFin) return;  

    __global const TTriangle *pTriangle = &pTriangleListe[index];  
    __private float distance = 0.f;  

    // Côté du triangle et normale au plan  
    __private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);  
    __private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);  
    __private CDPoint pvec = crossProduct(pDir, &edge2);            // produit vectoriel  

    // Le rayon et le triangle sont il parallèle ?  
    __private float det = dotProduct(&edge1, &pvec);  
    if (det == 0.f)  
    {  
        return ;  
    }  
    __private float inv_det = 1.f / det;  

    // Distance origin t0  
    __private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);  

    //Calculate u parameter and test bound  
    __private float u = (dotProduct(&tvec, &pvec)) * inv_det;  
    //The intersection lies outside of the triangle  
    if (u < -MIN_LONGUEUR_RT  
        || u > 1.f + MIN_LONGUEUR_RT)  
    {  
        return ;  
    }  

    u = max(u, 0.f);  

    //Prepare to test v parameter  
    __private CDPoint qvec = crossProduct(&tvec, &edge1);  

    //Calculate V parameter and test bound  
    __private float v = dotProduct(pDir, &qvec) * inv_det;  

    //The intersection lies outside of the triangle  
    if (v < -MIN_LONGUEUR_RT  
        || u + v  > 1.f + MIN_LONGUEUR_RT)  
    {  
        return ;  
    }  

    // Get distance  
    distance = dotProduct(&edge2, &qvec) * inv_det;  

    if (distance > -MIN_LONGUEUR_RT)  
    {  
        // We are using nbInter as semaphor index  
        GetSemaphor(nbInter);  

        __private int idxInter = *nbInter;  

        pResults[idxInter].distance_ = max(distance, 0.f);  

        // Intersection point  
        __private CDPoint vDir = homothetie(pDir, distance);  
        pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);  

        // Get ray way  
        pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;  

        // Triangle id  
        pResults[idxInter].idTriangle_ = index - idxDebutTriangle;  

        // inc nb inter  
        *nbInter = *nbInter + 1;  

        ReleaseSemaphor(nbInter);  
    }  

}  

我注意到如果我改变&#34; __全局const TTriangle * pTriangleListe&#34; by&#34; const TTriangle * pTriangleListe&#34;它编译,但它不是我想要的代码!

我想要做的就是填充&#34; pTriangleListe&#34;中的所有三角形,并使用统一网格获取要检查的三角形索引(idxDebutTriangle / idxFin)。 &#34; pPointOrigine&#34;是射线起源和&#34; pDir&#34;方向。 &#34; nbInter&#34;和&#34; pResults&#34;共享并将包含交集(它们受信号保护)

这是我的openCL计算机配置:

Platform [0]
id      = 5339E7D8
profile     = FULL_PROFILE
version     = OpenCL 1.2 AMD-APP (1445.5)
name    = AMD Accelerated Parallel Processing
vendor  = Advanced Micro Devices, Inc.
extensions = cl_khr_icd
                cl_khr_d3d10_sharing
                cl_khr_d3d11_sharing
                cl_khr_dx9_media_sharing
                cl_amd_event_callback
                cl_amd_offline_devices
                cl_amd_hsa




2 Devices detected
    Device [0]
        id                          = 010DFA00
        type                        = CL_DEVICE_TYPE_GPU
        name                        = Cedar
        vendor                      = Advanced Micro Devices, Inc.
        driver version              = 1445.5 (VM)
        device version              = OpenCL 1.2 AMD-APP (1445.5)
        profile                     = FULL_PROFILE
        max compute units           = 2
        max work items dimensions   = 3
        max work item sizes         = 128 / 128 / 128
        max work group size         = 128
        max clock frequency         = 650 MHz
        address_bits                = 32
        max mem alloc size          = 512 MB
        global mem size             = 1024 MB
        image support               = CL_TRUE
        max read image args         = 128
        max write image args        = 8
        2D image max size           = 16384 x 16384
        3D image max size           = 2048 x 2048 x 2048
        max samplers                = 16
        max parameter size          = 1024
        mem base addr align         = 2048
        min data type align size    = 128
        single fp config            = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
        global mem cache type       = CL_NONE
        max constant buffer size    = 64 KB
        max constant args           = 8
        local mem type              = CL_LOCAL
        local mem size              = 32 KB
        error correction support    = CL_FALSE
        profiling timer resolution  = 1 ns
        endian little               = CL_TRUE
        available                   = CL_TRUE
        compiler available          = CL_TRUE
        execution capabilities      = CL_EXEC_KERNEL
        queue properties            = CL_QUEUE_PROFILING_ENABLE
        extensions                  = cl_khr_global_int32_base_atomics
                                        cl_khr_global_int32_extended_atomics
                                        cl_khr_local_int32_base_atomics
                                        cl_khr_local_int32_extended_atomics
                                        cl_khr_3d_image_writes
                                        cl_khr_byte_addressable_store
                                        cl_khr_gl_sharing
                                        cl_ext_atomic_counters_32
                                        cl_amd_device_attribute_query
                                        cl_amd_vec3
                                        cl_amd_printf
                                        cl_amd_media_ops
                                        cl_amd_media_ops2
                                        cl_amd_popcnt
                                        cl_khr_d3d10_sharing
                                        cl_khr_d3d11_sharing
                                        cl_khr_dx9_media_sharing
                                        cl_amd_image2d_from_buffer_read_only
                                        cl_khr_spir
                                        cl_khr_gl_event


    Device [1]  
        id                          = 03501CD0  
        type                        = CL_DEVICE_TYPE_CPU    
        name                        =     Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz   
        vendor                      = GenuineIntel  
        driver version              = 1445.5 (sse2,avx)     
        device version              = OpenCL 1.2 AMD-APP (1445.5)   
        profile                     = FULL_PROFILE  
        max compute units           = 4     
        max work items dimensions   = 3     
        max work item sizes         = 1024 / 1024 / 1024    
        max work group size         = 1024  
        max clock frequency         = 3392 MHz  
        address_bits                = 32    
        max mem alloc size          = 1024 MB   
        global mem size             = 2048 MB   
        image support               = CL_TRUE   
        max read image args         = 128   
        max write image args        = 8     
        2D image max size           = 8192 x 8192   
        3D image max size           = 2048 x 2048 x 2048    
        max samplers                = 16    
        max parameter size          = 4096  
        mem base addr align         = 1024  
        min data type align size    = 128   
        single fp config            = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA    
        global mem cache type       = CL_READ_WRITE_CACHE   
        global mem cacheline size   = 64    
        global mem cache size       = 32768     
        max constant buffer size    = 64 KB     
        max constant args           = 8     
        local mem type              = CL_GLOBAL     
        local mem size              = 32 KB     
        error correction support    = CL_FALSE  
        profiling timer resolution  = 301 ns    
        endian little               = CL_TRUE   
        available                   = CL_TRUE   
        compiler available          = CL_TRUE   
        execution capabilities      = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL  
        queue properties            = CL_QUEUE_PROFILING_ENABLE     
        extensions                  = cl_khr_fp64   
                                        cl_amd_fp64     
                                        cl_khr_global_int32_base_atomics    
                                        cl_khr_global_int32_extended_atomics    
                                        cl_khr_local_int32_base_atomics     
                                        cl_khr_local_int32_extended_atomics     
                                        cl_khr_3d_image_writes  
                                        cl_khr_byte_addressable_store   
                                        cl_khr_gl_sharing   
                                        cl_ext_device_fission   
                                        cl_amd_device_attribute_query   
                                        cl_amd_vec3     
                                        cl_amd_printf   
                                        cl_amd_media_ops    
                                        cl_amd_media_ops2   
                                        cl_amd_popcnt   
                                        cl_khr_d3d10_sharing    
                                        cl_khr_spir     
                                        cl_amd_svm  
                                        cl_khr_gl_event

感谢您阅读!

0 个答案:

没有答案