我需要在 OpenCL 中实现稀疏矩阵向量乘法,但内核执行完成所花的时间非常长。我猜测速度慢的原因是内核中使用了原子加法函数。
这是内核的代码:
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
/*
 * Atomically performs `*loc += f` on a float stored in __global memory.
 *
 * The addition is emulated with a compare-and-swap loop on the 32-bit
 * pattern of the float: compute the new sum from the value last seen,
 * try to install it, and retry with the value actually found if another
 * work-item modified the location in between.
 *
 * loc  address of the float accumulator in __global memory
 * f    value to add to *loc
 */
void float_atomic_add(__global float *loc, const float f){
    /* Volatile view so every access really goes to memory and cannot be
       cached or hoisted out of the retry loop by the compiler. */
    volatile __global int *bits = (volatile __global int *)loc;

    /* Bit pattern we expect to find in memory on the next CAS attempt. */
    int expected = as_int(*(volatile __global float *)loc);

    for (;;) {
        const int desired = as_int(as_float(expected) + f);
        const int observed = atomic_cmpxchg(bits, expected, desired);
        if (observed == expected) {
            return; /* our sum was installed */
        }
        /* atomic_cmpxchg returned the current contents; reuse them instead
           of issuing another global load before retrying. */
        expected = observed;
    }
}
/*
 * Scatters one matrix entry into b for the position (lor, pixel) and for
 * its X, Y and XY symmetric counterparts, in that order.
 *
 * For each of the four positions the contribution v * x[<pixel index>] is
 * atomically added to b[<lor index>]; the symmetric indices are looked up
 * in the symmLOR_* (row side) and symm_*pixel (column side) tables.
 */
void forward_scatter_xy(const int lor, const int pixel, const float v,
                        __global int* symmLOR_X, __global int* symm_Xpixel,
                        __global int* symmLOR_Y, __global int* symm_Ypixel,
                        __global int* symmLOR_XY, __global int* symm_XYpixel,
                        __global float* x, __global float* b){
    float_atomic_add(&b[lor], v * x[pixel]);
    float_atomic_add(&b[symmLOR_X[lor]], v * x[symm_Xpixel[pixel]]);
    float_atomic_add(&b[symmLOR_Y[lor]], v * x[symm_Ypixel[pixel]]);
    float_atomic_add(&b[symmLOR_XY[lor]], v * x[symm_XYpixel[pixel]]);
}

/*
 * Forward projection: accumulates into b the products of matrix entries
 * with elements of x, one stored entry per work-item.
 *
 * Work-item i reads the triple (row[i], col[i], data[i]) and adds
 * data[i] * x[...] into b[...] eight times: for the entry itself, its
 * X / Y / XY symmetric positions, and then the same four positions after
 * applying the Z symmetry tables. All updates go through
 * float_atomic_add because different work-items may target the same
 * element of b.
 *
 * col, row, data      per-entry column index, row index and value
 * symmLOR_*           tables mapping a row index to its symmetric row index
 * symm_*pixel         tables mapping a column index to its symmetric column index
 * x                   input vector, indexed by column (pixel) index
 * b                   output vector, indexed by row (lor) index; updated in place
 *
 * NOTE(review): there is no guard on get_global_id(0), so the launch
 * presumably uses exactly one work-item per stored entry - confirm on the host.
 */
__kernel void forward(__global int* col, __global int* row, __global float* data, __global int* symmLOR_X, __global int* symm_Xpixel,__global int* symmLOR_Y, __global int* symm_Ypixel, __global int* symmLOR_XY,__global int* symm_XYpixel, __global int* symmLOR_Z, __global int* symm_Zpixel, __global float* x, __global float* b){
    const int entry = get_global_id(0);

    const int pixel = col[entry];
    const float v = data[entry];
    const int lor = row[entry];

    /* The entry itself plus its X, Y and XY symmetries. */
    forward_scatter_xy(lor, pixel, v,
                       symmLOR_X, symm_Xpixel,
                       symmLOR_Y, symm_Ypixel,
                       symmLOR_XY, symm_XYpixel,
                       x, b);

    /* Same four updates starting from the Z-symmetric position. */
    forward_scatter_xy(symmLOR_Z[lor], symm_Zpixel[pixel], v,
                       symmLOR_X, symm_Xpixel,
                       symmLOR_Y, symm_Ypixel,
                       symmLOR_XY, symm_XYpixel,
                       x, b);
}
为此,我使用COO格式的稀疏矩阵。