我正在研究一个内核,它可以找到光线和三角形列表之间的交叉点,但是(总有一个"但是")我在使用opencl编译器时遇到了一些问题,当我尝试编译时它真的崩溃了它
我尝试在我的CPU编译器上编译它并且编译得很好,但是使用我的GPU编译器它会崩溃......
//-----------------------------------------------------------------------------
//---------------------------------DEFINES-------------------------------------
//-----------------------------------------------------------------------------
#define RAYON_SORTANT -1000
#define RAYON_ENTRANT 1000
#define MIN_LONGUEUR_RT 1.E-6f
//-----------------------------------------------------------------------------
//---------------------------------CONTENT-------------------------------------
//-----------------------------------------------------------------------------
typedef struct s_CDPoint
{
float x;
float y;
float z;
} CDPoint;
typedef struct s_TTriangle
{
CDPoint triangle_[3];
CDPoint normal_;
} TTriangle;
typedef struct s_GridIntersection
{
CDPoint pos_;
float distance_;
int sensNormale_;
unsigned int idTriangle_;
} TGridIntersection;
//-----------------------------------------------------------------------------
//---------------------------------MUTEX---------------------------------------
//-----------------------------------------------------------------------------
void GetSemaphor(__global int * semaphor)
{
int occupied = atomic_xchg(semaphor, 1);
while(occupied > 0)
{
occupied = atomic_xchg(semaphor, 1);
}
}
void ReleaseSemaphor(__global int * semaphor)
{
int prevVal = atomic_xchg(semaphor, 0);
}
//-----------------------------------------------------------------------------
//---------------------------------GEOMETRIE-----------------------------------
//-----------------------------------------------------------------------------
float dotProduct(const CDPoint* pA, const CDPoint* pB)
{
return (pA->x * pB->x + pA->y * pB->y + pA->z * pB->z);
}
CDPoint crossProduct(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->y * pB->z - pB->y * pA->z;
res.y = pA->z * pB->x - pB->z * pA->x;
res.z = pA->x * pB->y - pB->x * pA->y;
return res;
}
CDPoint soustraction(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->x - pB->x;
res.y = pA->y - pB->y;
res.z = pA->z - pB->z;
return res;
}
CDPoint addition(const CDPoint* pA, const CDPoint* pB)
{
CDPoint res;
res.x = pA->x + pB->x;
res.y = pA->y + pB->y;
res.z = pA->z + pB->z;
return res;
}
CDPoint homothetie(const CDPoint* pA, float val)
{
CDPoint pnt;
pnt.x = pA->x * val;
pnt.y = pA->y * val;
pnt.z = pA->z * val;
return pnt;
}
//-----------------------------------------------------------------------------
//---------------------------------KERNEL--------------------------------------
//-----------------------------------------------------------------------------
__kernel void IntersectionTriangle( __global const TTriangle* pTriangleListe,
const unsigned int idxDebutTriangle,
const unsigned int idxFin,
__constant const CDPoint* pPointOrigine,
__constant const CDPoint* pDir,
__global int *nbInter,
__global TGridIntersection* pResults )
{
__private unsigned int index = get_global_id(0) + idxDebutTriangle;
if (index > idxFin) return;
__global const TTriangle *pTriangle = &pTriangleListe[index];
__private float distance = 0.f;
// Côté du triangle et normale au plan
__private CDPoint edge1 = soustraction(&pTriangle->triangle_[1], &pTriangle->triangle_[0]);
__private CDPoint edge2 = soustraction(&pTriangle->triangle_[2], &pTriangle->triangle_[0]);
__private CDPoint pvec = crossProduct(pDir, &edge2); // produit vectoriel
// Le rayon et le triangle sont il parallèle ?
__private float det = dotProduct(&edge1, &pvec);
if (det == 0.f)
{
return ;
}
__private float inv_det = 1.f / det;
// Distance origin t0
__private CDPoint tvec = soustraction(pPointOrigine, &pTriangle->triangle_[0]);
//Calculate u parameter and test bound
__private float u = (dotProduct(&tvec, &pvec)) * inv_det;
//The intersection lies outside of the triangle
if (u < -MIN_LONGUEUR_RT
|| u > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
u = max(u, 0.f);
//Prepare to test v parameter
__private CDPoint qvec = crossProduct(&tvec, &edge1);
//Calculate V parameter and test bound
__private float v = dotProduct(pDir, &qvec) * inv_det;
//The intersection lies outside of the triangle
if (v < -MIN_LONGUEUR_RT
|| u + v > 1.f + MIN_LONGUEUR_RT)
{
return ;
}
// Get distance
distance = dotProduct(&edge2, &qvec) * inv_det;
if (distance > -MIN_LONGUEUR_RT)
{
// We are using nbInter as semaphor index
GetSemaphor(nbInter);
__private int idxInter = *nbInter;
pResults[idxInter].distance_ = max(distance, 0.f);
// Intersection point
__private CDPoint vDir = homothetie(pDir, distance);
pResults[idxInter].pos_ = addition(pPointOrigine, &vDir);
// Get ray way
pResults[idxInter].sensNormale_ = dotProduct(&pTriangle->normal_, pDir) > 0.f ? RAYON_SORTANT : RAYON_ENTRANT;
// Triangle id
pResults[idxInter].idTriangle_ = index - idxDebutTriangle;
// inc nb inter
*nbInter = *nbInter + 1;
ReleaseSemaphor(nbInter);
}
}
我注意到如果我改变&#34; __全局const TTriangle * pTriangleListe&#34; by&#34; const TTriangle * pTriangleListe&#34;它编译,但它不是我想要的代码!
我想要做的就是填充&#34; pTriangleListe&#34;中的所有三角形,并使用统一网格获取要检查的三角形索引(idxDebutTriangle / idxFin)。 &#34; pPointOrigine&#34;是射线起源和&#34; pDir&#34;方向。 &#34; nbInter&#34;和&#34; pResults&#34;共享并将包含交集(它们受信号保护)
这是我的openCL计算机配置:
Platform [0]
id = 5339E7D8
profile = FULL_PROFILE
version = OpenCL 1.2 AMD-APP (1445.5)
name = AMD Accelerated Parallel Processing
vendor = Advanced Micro Devices, Inc.
extensions = cl_khr_icd
cl_khr_d3d10_sharing
cl_khr_d3d11_sharing
cl_khr_dx9_media_sharing
cl_amd_event_callback
cl_amd_offline_devices
cl_amd_hsa
2 Devices detected
Device [0]
id = 010DFA00
type = CL_DEVICE_TYPE_GPU
name = Cedar
vendor = Advanced Micro Devices, Inc.
driver version = 1445.5 (VM)
device version = OpenCL 1.2 AMD-APP (1445.5)
profile = FULL_PROFILE
max compute units = 2
max work items dimensions = 3
max work item sizes = 128 / 128 / 128
max work group size = 128
max clock frequency = 650 MHz
address_bits = 32
max mem alloc size = 512 MB
global mem size = 1024 MB
image support = CL_TRUE
max read image args = 128
max write image args = 8
2D image max size = 16384 x 16384
3D image max size = 2048 x 2048 x 2048
max samplers = 16
max parameter size = 1024
mem base addr align = 2048
min data type align size = 128
single fp config = CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
global mem cache type = CL_NONE
max constant buffer size = 64 KB
max constant args = 8
local mem type = CL_LOCAL
local mem size = 32 KB
error correction support = CL_FALSE
profiling timer resolution = 1 ns
endian little = CL_TRUE
available = CL_TRUE
compiler available = CL_TRUE
execution capabilities = CL_EXEC_KERNEL
queue properties = CL_QUEUE_PROFILING_ENABLE
extensions = cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_3d_image_writes
cl_khr_byte_addressable_store
cl_khr_gl_sharing
cl_ext_atomic_counters_32
cl_amd_device_attribute_query
cl_amd_vec3
cl_amd_printf
cl_amd_media_ops
cl_amd_media_ops2
cl_amd_popcnt
cl_khr_d3d10_sharing
cl_khr_d3d11_sharing
cl_khr_dx9_media_sharing
cl_amd_image2d_from_buffer_read_only
cl_khr_spir
cl_khr_gl_event
Device [1]
id = 03501CD0
type = CL_DEVICE_TYPE_CPU
name = Intel(R) Core(TM) i3-2130 CPU @ 3.40GHz
vendor = GenuineIntel
driver version = 1445.5 (sse2,avx)
device version = OpenCL 1.2 AMD-APP (1445.5)
profile = FULL_PROFILE
max compute units = 4
max work items dimensions = 3
max work item sizes = 1024 / 1024 / 1024
max work group size = 1024
max clock frequency = 3392 MHz
address_bits = 32
max mem alloc size = 1024 MB
global mem size = 2048 MB
image support = CL_TRUE
max read image args = 128
max write image args = 8
2D image max size = 8192 x 8192
3D image max size = 2048 x 2048 x 2048
max samplers = 16
max parameter size = 4096
mem base addr align = 1024
min data type align size = 128
single fp config = CL_FP_DENORM CL_FP_INF_NAN CL_FP_ROUND_TO_NEAREST CL_FP_ROUND_TO_ZERO CL_FP_ROUND_TO_INF CL_FP_FMA
global mem cache type = CL_READ_WRITE_CACHE
global mem cacheline size = 64
global mem cache size = 32768
max constant buffer size = 64 KB
max constant args = 8
local mem type = CL_GLOBAL
local mem size = 32 KB
error correction support = CL_FALSE
profiling timer resolution = 301 ns
endian little = CL_TRUE
available = CL_TRUE
compiler available = CL_TRUE
execution capabilities = CL_EXEC_KERNEL CL_EXEC_NATIVE_KERNEL
queue properties = CL_QUEUE_PROFILING_ENABLE
extensions = cl_khr_fp64
cl_amd_fp64
cl_khr_global_int32_base_atomics
cl_khr_global_int32_extended_atomics
cl_khr_local_int32_base_atomics
cl_khr_local_int32_extended_atomics
cl_khr_3d_image_writes
cl_khr_byte_addressable_store
cl_khr_gl_sharing
cl_ext_device_fission
cl_amd_device_attribute_query
cl_amd_vec3
cl_amd_printf
cl_amd_media_ops
cl_amd_media_ops2
cl_amd_popcnt
cl_khr_d3d10_sharing
cl_khr_spir
cl_amd_svm
cl_khr_gl_event
感谢您阅读!