如何在OpenCL内核中实现'break;'和'continue;'?'return;'是否等价于'break;'?
我使用的是openCL 1.2
我想用3个嵌套的for循环来实现它,循环遍历嵌套数组的typedef结构。
修改
意识到我需要展示一些代码以便更好地理解
IN KERNEL ......
/*
 * Pair of float arrays attached to one direction entry.
 * The code in this file indexes both arrays with the same index f
 * (high[f], low[f]) and compares them element by element between two entries.
 *
 * NOTE(review): both pointers carry no address-space qualifier, and the struct
 * is read through a kernel argument. Confirm that the memory they point to is
 * actually reachable from the device before relying on this layout.
 */
typedef struct tag_sfextras
{
float *high; /* array read as high[f] */
float *low; /* array read as low[f] */
}sfextras;
/*
 * One direction entry.
 * In this file, time is compared against the _start/_stop bounds, result is
 * multiplied by the computed weight, and fextras is indexed as fextras[d].
 *
 * NOTE(review): fextras is an unqualified pointer stored inside data that the
 * kernel reads; confirm it refers to device-accessible memory.
 */
typedef struct tag_sdirection
{
int time; /* compared against _start and _stop; unit not shown here */
float result; /* value weighted and accumulated into the output */
sfextras *fextras; /* array read as fextras[d] */
}sdirection;
/*
 * Call - per-work-item version of the host triple loop over (i, d, f).
 *
 * Each work-item takes i, d and f from get_global_id(0), (1) and (2) and
 * computes the squared high/low differences between entry _index - 1 and
 * entry i.
 *
 * _direction  entries to compare (read only by this kernel)
 * _index      number of valid entries; entry _index - 1 is the reference
 * _start      entries with time <  _start are skipped
 * _stop       entries with time >= _stop  are skipped
 * _result     output; only _result[0] is written
 *
 * There is no enclosing loop in a kernel, so the host loop's `break` and
 * `continue` both become `return`: the work-item simply contributes nothing.
 * (`return` matches `break` only if no later entry i would have passed the
 * _stop test, e.g. when entries are ordered by time - TODO confirm.)
 *
 * NOTE(review): every work-item resets, accumulates into and divides
 * _result[0] without synchronisation, and _sum is private to one work-item.
 * That is a data race and the normalisation is not the global one the host
 * loop computes. A correct version needs per-work-item partial outputs (or a
 * reduction) and therefore a different output layout; not changed here to
 * keep the kernel signature usable by the existing host code.
 */
__kernel void Call(__global sdirection *_direction,
                   int _index,
                   int _start,
                   int _stop,
                   __global float *_result)
{
    float _sum = 0.0f;
    if (_index > 1)
    {
        _result[0] = 0.0f; /* NOTE(review): racy, see header comment */
        int i = get_global_id(0);

        /* The host loop only visits i in [0, _index - 1). */
        if (i >= _index - 1)
        {
            return;
        }
        /* Host code used `break` here. */
        if (_direction[i].time >= _stop)
        {
            return;
        }
        /* Host code used `continue` here. */
        if (_direction[i].time < _start)
        {
            return;
        }
        /* _start is a by-value argument: this store is visible to this
         * work-item only and does not affect the filtering of other entries. */
        _start = _direction[i].time + (1440 * 60);

        int d = get_global_id(1);
        int f = get_global_id(2);

        /* Weight for this single (i, d, f) combination. */
        float _fextras_weight = 0.0f;
        _fextras_weight += (float)pow(_direction[_index - 1].fextras[d].high[f] - _direction[i].fextras[d].high[f], 2.0f);
        _fextras_weight += (float)pow(_direction[_index - 1].fextras[d].low[f] - _direction[i].fextras[d].low[f], 2.0f);

        _result[0] += _fextras_weight*_direction[i].result; /* NOTE(review): racy */
        _sum += _fextras_weight;
    }
    /* Work-items that returned early never reach this point; their _sum would
     * have been 0 anyway, so no division is lost. */
    if (_sum > 0.0f)
    {
        _result[0] /= _sum;
    }
}
IN HOST(我试图在内核中复制以提高效率的代码)
/*
 * Host-side reference computation: weighted average of _direction[row].result
 * over all earlier entries inside the [_start, _stop) time window, where the
 * weight is the sum of squared high/low differences against the newest entry
 * (_direction[_direction_index-1]).
 */
if (_direction_index > 1)
{
    _fextras = 0.0f;
    for (int row = 0; row < _direction_index-1; ++row)
    {
        /* Stop scanning at the first entry at or past the upper bound. */
        if (_direction[row].time >= _stop)
        {
            break;
        }
        /* Skip entries before the lower bound. */
        if (_direction[row].time < _start)
        {
            continue;
        }
        _direction_start = _direction[row].time + (1440*60);

        for (int dir = 0; dir < _DIRECTION; ++dir)
        {
            for (int fx = 0; fx < _FEXTRAS; ++fx)
            {
                /* Squared distance of the high and low values for this slot. */
                float weight = (float)pow(_direction[_direction_index-1].fextras[dir].high[fx]-_direction[row].fextras[dir].high[fx],2.0f);
                weight += (float)pow(_direction[_direction_index-1].fextras[dir].low[fx]-_direction[row].fextras[dir].low[fx],2.0f);

                _fextras += weight*_direction[row].result;
                _sum += weight;
            }
        }
    }
    /* Normalise only when at least one non-zero weight was accumulated. */
    if (_sum > 0.0f)
    {
        _fextras /= _sum;
    }
}
答案 0 :(得分:2)
在OpenCL中强行取消所有其他线程会导致未定义行为,因为其中许多线程可能正处于读写全局/本地内存的过程中;而且这还意味着要终止正在运行的线程(停止所有其他/剩余的内核/线程)。这大概就是OpenCL没有提供这种机制的原因。
但是你可以添加一个输出数组,让每个线程把自己的最终状态写入其中。如果某个元素带有"RETURNED"状态码,就应当检查"AFTER_RETURNED"状态码,丢弃这些线程的计算结果,只接受"BEFORE_RETURNED"的结果。这还需要在输出阶段使用原子操作,因此会变慢,这是它的缺点。
但你可以安全地从单个内核返回:
下面的代码可以正常编译,并且在HD7870和R7-240上通过'return'提前退出(只有部分线程结束内核执行,而不是全部)也没有出错,因为OpenCL的限制中并不包括'return'。
/*
 * Example kernel showing an early exit: a work-item for which tmp is false
 * leaves the kernel with `return`, while the others go on.
 * The `...` lines stand for code elided by the original author.
 */
__kernel void rarToVideo(__global int * p,__global char * c)
{
    ...
    /* Guard clause: nothing left to do for this work-item. */
    if (!tmp)
    {
        return;
    }
    foo=1;
    ...
}
使用的是OpenCL 1.2的C++头文件。
但是,如果你仍然需要"伪返回"(fake return),并且各线程不会影响其他线程的输入/输出,那么类似下面的做法会有所帮助:
// Beginning phase of this work-item.
// If the shared flag already reads RETURNED or higher, mark this work-item as
// NOT_STARTED and leave immediately, so it does not spend ALU / load-store
// resources and leaves room for other wavefronts.
if(globalAtomicElement[0]>=RETURNED)
{
outputState[threadId]=NOT_STARTED;
return;
}
...
...
// Ending phase of this work-item.
// localState tells whether this work-item needed a "return".
// State codes as listed by the original author:
// 0=NOT_RETURNED
// 1=RETURNED
// 2=NOT_STARTED
// atomic_add adds localState to the shared flag and yields the value the flag
// held before the addition; that previous value decides the outcome below.
lastResult=atomic_add(globalAtomicElement,localState);
if(lastResult>=RETURNED)
{
outputState[threadId]=AFTER_RETURNED; // omit this work-item's result:
// another work-item had already signalled
// a stop for everyone before this one
// reached the ending phase.
// Original author's note: the cycles spent by this work-item are not an
// additional loss; according to the author, on GCN 1.0 - GCN 3.0 a core
// keeps spinning inside a compute unit whenever another core/shader of
// that unit is busy, and Polaris is expected to be able to shut down
// unused cores, so power consumption should not be a concern either.
// NOTE(review): hardware claims are the author's and are not verifiable
// from this snippet.
}
else if(lastResult==NOT_RETURNED && thisThreadReturned)
{
outputState[threadId]=RETURNED; // this is the returning work-item:
// it finishes and pretends to stop
// all the others.
}
else if(lastResult==NOT_RETURNED && !thisThreadReturned)
{
outputState[threadId]=BEFORE_RETURNED; // accept this work-item's result:
// the flag was still NOT_RETURNED when
// it finished, so it completed all of
// its work before any stop was
// signalled.
}
然后在主机端,你只需检查/筛选出"BEFORE_RETURNED"和"RETURNED"的结果,并丢弃"AFTER_RETURNED"的结果。
在OpenCL 2.0中,你可以试试这个:
这样至少可以省掉一半的线程(或1/4、1/8……或1/N),但由于只用2个线程时效率很低,所以会很慢。