如何在 OpenCL 中避免浮点数的原子加法(atomic add)

时间:2015-04-27 21:10:44

标签: c++ parallel-processing opencl gpgpu

我需要在 OpenCL 中实现稀疏矩阵-向量乘法,但内核(kernel)执行完成所需的时间非常长。我推测速度慢的原因是内核中使用了原子加法函数。

这是内核的代码:

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

/*
 * Atomically performs *loc += f on a float stored in global memory.
 *
 * The update is a compare-and-swap loop over the 32-bit pattern of the
 * float: the sum is computed from the last observed value and is only
 * stored if the location still holds that exact bit pattern; otherwise
 * the attempt is repeated with the value that was actually found.
 *
 * loc  address of the float accumulator in global memory
 * f    value to add
 */
void float_atomic_add(__global float *loc, const float f){
    /* atomic_cmpxchg works on 32-bit integers, so address the same word
     * as an int. The volatile qualifier keeps the initial load from being
     * cached or reordered away by the compiler. */
    volatile __global int *word = (volatile __global int *)loc;

    /* Reinterpret float <-> int through a union instead of casting
     * pointers to private variables. */
    union { int bits; float value; } expected, desired;

    int seen = *word;
    do {
        expected.bits = seen;
        desired.value = expected.value + f;
        /* atomic_cmpxchg returns the value it found at the address, so a
         * failed attempt already provides the fresh value for the retry
         * and no additional global read is needed. */
        seen = atomic_cmpxchg(word, expected.bits, desired.bits);
    } while (seen != expected.bits);
}



 /*
  * Accumulates one stored matrix entry into the output vector b.
  *
  * Each work-item reads the entry at its global id from col/row/data and
  * performs eight atomic accumulations of the form
  *     b[r] += data[entry] * x[c]
  * first for the entry's own (row, col) pair and its images under the
  * X, Y and XY lookup tables, then for the Z-mapped pair and its images
  * under the same three tables.
  *
  * NOTE(review): there is no bounds check on the global id, so the global
  * work size presumably must equal the number of stored entries - confirm
  * on the host side.
  * NOTE(review): the symm* tables look like geometric-symmetry index maps
  * (rows and columns are named "LOR" and "pixel") - confirm with the caller.
  */
 __kernel void forward(__global int* col, __global int* row, __global float* data,
                       __global int* symmLOR_X, __global int* symm_Xpixel,
                       __global int* symmLOR_Y, __global int* symm_Ypixel,
                       __global int* symmLOR_XY, __global int* symm_XYpixel,
                       __global int* symmLOR_Z, __global int* symm_Zpixel,
                       __global float* x, __global float* b){

  const int entry = get_global_id(0);
  const float weight = data[entry];

  /* Row/column pair handled by the current pass. */
  int cur_pixel = col[entry];
  int cur_lor = row[entry];

  /* Pass 0 uses the stored pair, pass 1 the Z-mapped pair. */
  for (int pass = 0; pass < 2; ++pass) {
    if (pass == 1) {
      /* Both lookups use the indices of the original entry. */
      cur_pixel = symm_Zpixel[cur_pixel];
      cur_lor = symmLOR_Z[cur_lor];
    }

    /* The pair itself. */
    float_atomic_add(&b[cur_lor], weight * x[cur_pixel]);

    /* Image under the X tables. */
    float_atomic_add(&b[symmLOR_X[cur_lor]], weight * x[symm_Xpixel[cur_pixel]]);

    /* Image under the Y tables. */
    float_atomic_add(&b[symmLOR_Y[cur_lor]], weight * x[symm_Ypixel[cur_pixel]]);

    /* Image under the XY tables. */
    float_atomic_add(&b[symmLOR_XY[cur_lor]], weight * x[symm_XYpixel[cur_pixel]]);
  }
 }

为此,我使用COO格式的稀疏矩阵。

0 个答案:

没有答案