以下是我的代码的一部分:我更新一些值,并将结果写入内存的另一块区域。
/*
 * Naive all-pairs update kernel.
 *
 * Each work-item with global id < n reads its own element p[gID], then walks
 * over every other element of p directly from global memory and accumulates
 * a result in 'val', which is stored to f[gID] exactly once.
 *
 *   f        output buffer; f[gID] is written for gID < n
 *   p        input buffer; elements 0 .. n-1 are read
 *   n        number of valid elements in p and f
 *   sharedP  local scratch buffer; not used by this variant
 */
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
    size_t gID = get_global_id(0);
    size_t lID = get_local_id(0);   // queried but not used in this variant
    float4 val = (float4)(0.f,0.f,0.f,0.f);

    // Work-items beyond the valid range have nothing to compute or store.
    if(gID >= n){
        return;
    }

    float4 p1 = p[gID];
    size_t i = 0;
    while(i < n){
        // Skip the pairing of an element with itself.
        if(i == gID){
            ++i;
            continue;
        }
        float4 p2 = p1 - p[i];
        //Some other calculations to finally update 'val'
        ++i;
    }

    f[gID] = val;
}
这里每个线程从全局内存读取 n + 1 次,但只回写一次。我最初的想法是,瓶颈来自对全局内存的重复读取,因此我做了一些修改:把数据分块复制到共享内存中,再依次换入其他块,等等。但这并没有使性能提高多少。后来我发现瓶颈在于 f[gID] = val; 这一句。例如,当 n = 8192、内核运行 200 次时,需要 3.5s;但如果我把 f[gID] = val; 注释掉,则只需 0.05s。
另一种方法(使用共享内存)的代码:
/*
 * Tiled all-pairs update kernel using local memory.
 *
 * The input is processed one work-group-sized tile at a time. For each tile,
 * the work-items of a group cooperatively copy the tile from p into sharedP,
 * synchronise, and then every active work-item (gID < n) combines its own
 * element with each valid element of the tile. Tiles are visited in rotated
 * order, starting with the group's own tile, so the self-pair can only occur
 * in the first tile.
 *
 *   f        output buffer; f[gID] is written once for gID < n
 *   p        input buffer; elements 0 .. n-1 are read
 *   n        number of valid elements in p and f (the global size may be
 *            padded beyond n)
 *   sharedP  local scratch buffer; must hold at least get_local_size(0)
 *            float4 elements
 */
__kernel void update(
__global float4 *f,
__global float4 *p,
const unsigned int n,
__local float4 *sharedP){
    const size_t lID = get_local_id(0);
    const size_t gID = get_global_id(0);
    const size_t localSize = get_local_size(0);
    const size_t groupID = get_group_id(0);
    const size_t numGroups = get_num_groups(0);

    // Padded work-items must not touch p[gID] or f[gID], but they still have
    // to take part in the tile loads and reach every barrier below.
    const bool active = gID < n;

    float4 val = (float4)(0.f,0.f,0.f,0.f);
    float4 p1 = (float4)(0.f,0.f,0.f,0.f);
    if(active){
        p1 = p[gID];
    }

    for(size_t block = 0; block < numGroups; ++block){
        // Rotate through the tiles, wrapping by the number of groups so the
        // tile index stays valid even when the global size is larger than n.
        size_t tile = groupID + block;
        if(tile >= numGroups){
            tile -= numGroups;
        }
        const size_t base = tile * localSize;

        // Number of valid elements in this tile; the last tile may be short
        // and fully padded tiles are empty.
        size_t count = 0;
        if(base < n){
            count = n - base;
            if(count > localSize){
                count = localSize;
            }
        }

        // Slots at or beyond 'count' are never read, so they are not loaded.
        if(lID < count){
            sharedP[lID] = p[base + lID];
        }
        // Outside any divergent branch: every work-item of the group gets here.
        barrier(CLK_LOCAL_MEM_FENCE);

        if(active){
            for(size_t i = 0; i < count; ++i){
                // block == 0 is this group's own tile; skip the self-pair there.
                if(block != 0 || i != lID){
                    float4 p2 = p1 - sharedP[i];
                    //Some other calculations to update 'val'
                }
            }
        }
        // Do not let any work-item overwrite sharedP with the next tile while
        // others are still reading the current one.
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // Single store after all tiles have been accumulated.
    if(active){
        f[gID] = val;
    }
}
那么,为什么写入内存会导致速度下降这么多?又为什么使用共享内存没有显著提高性能呢?