这是一个简单的OpenCL矩阵乘法内核,它让我发疯:
顺便说一句,我使用的是pyopencl。
__kernel void matrixMul( __global int* C,
__global int* A,
__global int* B,
int wA, int wB){
int row = get_global_id(1); //2D Threas ID x
int col = get_global_id(0); //2D Threas ID y
//Perform dot-product accumulated into value
int value = 0;
for ( int k = 0; k < wA; k++ ){
value += A[row*wA + k] * B[k*wB+col];
}
C[row*wA+col] = value; //Write to the device memory
}
其中(输入)
A = [72 45
75 61]
B = [26 53
46 76]
wA = wB = 2
我得到的输出:
有时我得到:
C = [3942 0
0 5472]
我得到了:
C = [3942 7236
3312 5472]
但输出应该是:
C = [3942 7236
4756 8611]
我不知道我在这里犯了什么错误。我整整一天都没有运气。
请帮我解决这个问题
这是完整的python代码:
import pyopencl as cl
import numpy as np
import os
ORDER = 2
LEN = ORDER*ORDER
ctx = cl.create_some_context()
commandQueue = cl.CommandQueue( ctx )
A = np.array((72, 45, 75, 61), dtype = np.int32)
B = np.array((26, 53, 46, 76), dtype = np.int32)
C = np.empty_like(A)
in_buf1 = cl.Buffer( ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
hostbuf = A )
in_buf2 = cl.Buffer( ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
hostbuf = B )
out_buf = cl.Buffer( ctx, cl.mem_flags.WRITE_ONLY, C.nbytes )
kernelSrc1 = """__kernel void
matrixMul( /*const int Mdim,
const int Ndim,
const int Pdim,*/
__global int* C,
__global int* A,
__global int* B,
int wA, int wB)
{
int row = get_global_id(1); //2D Threas ID x
int col = get_global_id(0); //2D Threas ID y
//Perform dot-product accumulated into value
int value = 0;
for ( int k = 0; k < wA; k++ ){
value += A[row*wA + k] * B[k*wB+col];
}
C[row*wA+col] = value; //Write to the device memory
}"""
program1 = cl.Program(ctx, kernelSrc1 ).build()
event1 = program1.matrixMul( commandQueue, (LEN, ), None,
out_buf, in_buf1, in_buf2, np.int32(ORDER), np.int32(ORDER));
event1.wait()
cl.enqueue_copy(commandQueue, C, out_buf)
print C
我正在使用Python 2.7.x,pyopencl 2012.1,AMD APP SDK
答案 0 :(得分:6)
您正在错误地设置全局大小参数。由于您在内核中使用全局大小的两个维度,因此需要将全局大小设置为(ORDER,ORDER)。当你改变它时,你得到:
[3942 7236
4756 8611]