我从一个NVIDIA OpenCL示例中调整了附加的内核,并将性能与clblasSgemm进行了比较,发现它们的执行速度相同(至少在我的设置上)。我使用{16, 16}
本地工作规模启动它。
现在,假设矩阵A和B都是uchar
,而C则相应uint
。有没有办法优化乘法?只需更换类型降低性能。我尝试使用uchar4
和uchar16
进行手工矢量化,但这会让它变慢。
欢迎任何建议! (我是GPU编程和OpenCL的新手)
/*
* This software contains source code provided by NVIDIA Corporation.
*/
#define BLOCK_SIZE 16
__kernel void mat_mul(const __global float* A, const __global float* B,
__global float* C,
const int A_cols, const int B_cols) {
// Block index
const int bx = get_group_id(0);
const int by = get_group_id(1);
// Thread index
const int tx = get_local_id(0);
const int ty = get_local_id(1);
// Index of the first sub-matrix of A processed by the block
const int a0 = A_cols * BLOCK_SIZE * by;
// Index of the last sub-matrix of A processed by the block
const int a1 = a0 + A_cols - 1;
const int a_step = BLOCK_SIZE;
// Index of the first sub-matrix of B processed by the block
const int b0 = BLOCK_SIZE * bx;
// Step size used to iterate through the sub-matrices of B
const int b_step = BLOCK_SIZE * B_cols;
// Csub is used to store the element of the block sub-matrix
// that is computed by the thread
float Csub = 0;
__local float As[BLOCK_SIZE][BLOCK_SIZE];
__local float Bs[BLOCK_SIZE][BLOCK_SIZE];
// Loop over all the sub-matrices of A and B required to compute the
// block sub-matrix
for (int a=a0, b=b0; a<=a1; a+=a_step, b+=b_step) {
// Load the matrices from device memory to shared memory;
// each thread loads one element of each matrix
As[ty][tx] = A[a + A_cols * ty + tx];
Bs[ty][tx] = B[b + B_cols * ty + tx];
// Synchronize to make sure the matrices are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrices together;
// each thread computes one element of the block sub-matrix
#pragma unroll
for (int k=0; k<BLOCK_SIZE; ++k) {
Csub += As[ty][k] * Bs[k][tx];
}
// Synchronize to make sure that the preceding computation is done
// before loading two new sub-matrices of A and B in the next
// iteration
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the block sub-matrix to device memory;
// each thread writes one element
C[get_global_id(1) * get_global_size(0) + get_global_id(0)] = Csub;
}
答案 0 :(得分:1)
有一种非常简单的方法来衡量你的内核是否良好。计算它的OPS&amp;带宽(每秒处理多少矩阵形式的数据)。然后将其与理论极限进行比较。你会得到因素,限制性能。通常,它是加载存储操作。