我把 scratchapixel.com 网站上的 C++ 光线追踪算法移植成了由 Java 主机端调用的 OpenCL 版本(编译后在 GPU 上运行),但当深度值等于或大于 7 时它无法工作。当我把深度设置为 7(或更高)时,编译器报告:
Error:E013:Insufficient Private Resources!
由于采用了半递归函数,每一层函数都会再占用一份 32 位私有寄存器(60 多个)。我需要让这些寄存器溢出(spill)到主内存,这样就可以设置更高的深度值,而不必把内核改写成迭代版本。寄存器溢出是个坏主意吗?如果不是,我该如何启用它?
如果我把球体数量设置为大于约 100 的值,编译器会报告:
Frontend phase failed compilation.
Error: Creating kernel rayTraceSphereRender failed!
但是缓冲区的总数和长度都是固定的(最多可容纳 sphereNumber = 4096),所以我只是把 for 循环的上界从 100 改为 110,编译器就给出了该错误。我怀疑编译器自动展开了循环,从而造成更大的寄存器压力,导致需要更多的私有寄存器(因此同样需要寄存器溢出)。
以下是主机构建内核字符串的方式:
// Append the terminating stub trace<depth+1> to the kernel source. It ignores
// all of its arguments and returns a zero float4. The deepest generated
// function (trace<depth>) contains calls to trace<depth+1>, so this stub gives
// those calls a definition to resolve to and ends the call chain.
kernelx+=
"float4 trace"+(depth+1)+"(float4 *rayorig, float4 *raydir,__global sphereObject *spheres, int sphereNum, int depth, int threadNo ){return (float4)(0,0,0,0);}";
// Emit one copy of the tracing function per depth level: trace<depth>, then
// trace<depth-1>, ... down to trace0. Counting downwards means every callee
// (trace<iteration+1>) is already present in the source text before the
// function that calls it.
// NOTE: inside the string literals, `depth` is the generated function's own
// int parameter (the current bounce count); outside the literals, `depth` is
// the host-side Java variable holding the maximum depth.
for(int iteration=depth;iteration>=0;iteration--)
{
// Emulate recursion by "bloating" the source: each traceN is a full copy of
// the body whose reflection/refraction calls go to trace(N+1) instead of to
// itself, which makes the traceX(...) family semi-recursive.
kernelx+=
"float4 trace"+iteration+"(float4 *rayorig, float4 *raydir,__global sphereObject *spheres, int sphereNum, int depth, int threadNo )"
+ "{"
// The host-side maximum depth is baked into the generated code as a constant.
+ " int MAX_RAY_DEPTH="+depth+";"
+ "float facingratio =0; float fresneleffect=0;"
+ "float4 refldir=(float4)(0,0,0,0);"
+ "float4 arg00=(float4)(0,0,0,0); float4 reflection=(float4)(0,0,0,0);float4 refraction=(float4)(0,0,0,0);"
+ "float4 refrdir=(float4)(0,0,0,0); float4 arg01=(float4)(0,0,0,0); "
+ "float4 surfaceColor=(float4)(0,0,0,0);"
+ "float4 phit=(float4)(0,0,0,0); "
+ "float4 nhit=(float4)(0,0,0,0);"
+ "float bias=0.000f; "
+ "float4 traceTmpReturn=(float4)(0,0,0,0);"
+ "float tnear=100000000.0f; "
// `so` is a private copy of the nearest sphere hit so far; initElem is defined
// elsewhere in the kernel source (not shown here).
+ "sphereObject so;"
+ " initElem(&so); "
// Nearest-hit search: test the ray against every sphere and keep the one with
// the smallest hit distance in tnear / so.
+ " for(int i=0;i<sphereNum;i++)"
+ " {"
+ " float t0=100000000.0f;float t1=100000000.0f;"
+ " if(intersect(rayorig[0],raydir[0],&t0,&t1,spheres[i].center,spheres[i].radius)==1)"
+ " {"
// A negative t0 falls back to t1 as the hit distance.
+ " if(t0<0){t0=t1;}"
+ " if(t0<tnear)"
+ " {"
+ " tnear=t0;"
//+ " so=&spheres[i];"
+ " copyElem(&so,spheres[i]);"
+ " }"
+ " }"
+ " }"
+ " "
// A radius below -0.5 is treated as "no sphere was hit" and yields black.
// NOTE(review): presumably initElem sets a negative sentinel radius - confirm.
+ " if(so.radius<-0.5f){ return (float4)(0,0,0,0);} "
+ ""
// Hit point and unit normal at the hit point.
+ " surfaceColor=(float4)(0,0,0,0);"
+ " phit=rayorig[0]+raydir[0]*tnear;"
+ " nhit=phit-so.center;"
+ " nhit=normalize(nhit);"
+ " bias=0.001f;"
+ " bool inside=false;"
// If the normal and the ray direction point the same way, flip the normal and
// record that the ray is inside the sphere.
+ " if(dot3X(raydir[0],nhit)>0){nhit=-nhit; inside=true;}"
// Transparent or reflective surface and depth budget left: follow secondary rays.
+ " if(((so.transparency > 0) || (so.reflection > 0))&& (depth<MAX_RAY_DEPTH) )"
+ " {"
+ " facingratio = -dot3X(raydir[0],nhit);"
+ " fresneleffect= mixx(pow(1.0f-facingratio,3),1.0f,0.1f);"
+ " refldir = raydir[0] - nhit*2.0f*dot3X(raydir[0],nhit);"
+ " refldir=normalize(refldir);"
// Offset the secondary-ray origin along the normal by `bias`.
+ " arg00=phit+nhit*bias;"
// "Recursive" call: the reflection ray is traced by the next depth level's copy.
+ " reflection=trace"+(iteration+1)+"(&arg00,&refldir,spheres,sphereNum,depth+1,threadNo);"
+ " refraction=(float4)(0,0,0,0);"
+ " if(so.transparency>0)"
+ " {"
// Refraction direction computed from ior = 1.1; eta is ior when inside the
// sphere, otherwise 1/ior.
+ " float ior=1.1f; float eta=(inside)?ior:1/ior; "
+ " float cosi=-dot3X(nhit,raydir[0]);"
+ " float k=1.0f-eta*eta*(1.0f-cosi*cosi);"
+ " refrdir = raydir[0]*eta+nhit*(eta*cosi-sqrt(k));"
+ " refrdir=normalize(refrdir);"
// The refraction ray starts on the opposite side of the surface (minus bias).
+ " arg01=phit-nhit*bias;"
+ " refraction=trace"+(iteration+1)+"(&arg01,&refrdir, spheres,sphereNum, depth+1, threadNo);"
+ " "
+ " }"
// Blend reflection and refraction by the fresnel term, tinted by the surface colour.
+ " surfaceColor=(reflection*fresneleffect+refraction*(1.0f-fresneleffect)*so.transparency)*so.surfaceColor;"
+ ""
+ " }"
+ " else"
+ " {"
// Otherwise (surface is neither transparent nor reflective, or the depth limit
// is reached): every sphere whose emissionColor.x is positive acts as a light.
+ " for(int i=0;i<sphereNum;i++)"
+ " {"
+ " if(spheres[i].emissionColor.x>0)"
+ " {"
+ " float4 transmission=(float4)(1,1,1,1);"
+ " float4 lightDirection=spheres[i].center-phit;"
+ " lightDirection=normalize(lightDirection);"
// Shadow test: any other sphere intersected along the light direction zeroes
// the transmission; the search stops at the first occluder.
+ " for(int j=0;j<sphereNum;j++)"
+ " {"
+ " if(i!=j)"
+ " {"
+ " float t0,t1;"
+ " float4 arg02=phit+nhit*bias;"
+ " if(intersect(arg02,lightDirection,&t0,&t1,spheres[j].center,spheres[j].radius)==1)"
+ " {"
+ " transmission= (float4)(0,0,0,0);break;"
+ " } "
+ " } "
+ " }"
+ " surfaceColor += so.surfaceColor*transmission*max(0.0f, dot3X(nhit,lightDirection))*spheres[i].emissionColor;"
+ " }"
+ " }"
+ " }"
+ ""
// The hit sphere's own emission is always added to the result.
+ " return surfaceColor+so.emissionColor; "
+ "}";
}
// Append the kernel entry point rayTraceSphereRender. Each work item derives
// one pixel (x, y) from its global id, builds a primary ray, traces it through
// trace0 and stores the colour in `image`.
// Of the buffer parameters, only `image` and `spheres` are referenced in this
// kernel body; `lid` is computed but not used afterwards.
kernelx+= "__kernel void rayTraceSphereRender(__global float4 *center, __global float *radius,"
+ " __global float4 *surfaceColor, __global float4 *emissionColor,"
+ " __global float *transparency,"
+ " __global float *reflection, __global float4 *image,"
+ " __global sphereObject *spheres)"+
"{"+
" int gid=get_global_id(0);" +
" int lid=get_local_id(0);"
+ " "
+ " {"
// Sphere count and the image size (n x n) are baked into the source as constants.
+ " int numSphr="+raytraceSphereNumber0+";"
+ " int width="+n+", height="+n+";"
+ " float invWidth = 1.0f/((float)width);"
+ " float invHeight= 1.0f/((float)height);"
+ " float fov = 30.0f; float aspectratio= ((float)width)/((float)height);"
+ " float angle=tan(3.141592653589793f*0.5f*fov/(180.0f));"
// Linear global id -> pixel coordinates: row y = gid / n, column x = gid % n.
+ " int y=gid/"+n+";"
+ " "
+ " int x=gid%"+n+";"
+ " "
// Primary ray direction from the pixel coordinates, scaled by the field-of-view
// term and the aspect ratio, then normalized.
+ " float xx=(2.0f*((x-0.5f)*invWidth)-1.0f)*angle*aspectratio;"
+ " float yy=(1.0f-2.0f*((y-0.5f)*invHeight))*angle;"
+ " float4 raydir=(float4)(xx,yy,-1.0f,0.0f);"
+ " raydir=normalize(raydir);"
// The ray origin `ref` is the zero vector; tracing starts at depth 0 and gid is
// passed as threadNo.
+ " float4 ref=(float4)(0,0,0,0);"
+ " float4 upp=(float4)(0,0,0,0);"
+ " upp=trace0(&ref,&raydir,spheres,numSphr,0,gid);"
+ " upp.w=1.0;"
// Only pixels with x > width - width/sayi2 are written; the store index is
// y + x*n.
// NOTE(review): the purpose of this column restriction and of the x-major
// index is not visible here - confirm against the host code that reads `image`.
+ " if(x>(width-width/"+sayi2+")) image[y+x*"+n+"]=upp;"
+ " "
+ " "
+ " "
+ " }"
+ " "
//
+"}";
主机:FX8150,Windows 7 64 位家庭高级版,64 位 Java(Eclipse)
设备:HD7870,Catalyst 13.12 驱动
在 GPU 上,代码只有在深度和球体数量较低时才能工作;而在 CPU 上,无论我把深度和球体数量设置成多少都没有问题: