Comparing batched matrix-multiplication computations with pyopencl and pycuda shows a performance difference.
System:
Ubuntu 14.04 with GeForce 920m
PyOpenCL code:
#-*- coding: utf-8 -*-
import pyopencl as cl
import pyopencl.array
from jinja2 import Template
import time
import numpy as np
KERNEL = Template("""
{{header}}
#include <pyopencl-complex.h>
__kernel
void complex_mat_mul(__global const {{complex_type}} *a, __global const {{complex_type}} *b, __global {{complex_type}} *res)
{
    int row = get_local_id(1);
    int col = get_local_id(0);
    // one work-group per matrix: the launch uses a mats_count x mats_count
    // grid of groups, so this linearizes the 2-D group index
    int mat_id = get_group_id(0) * get_num_groups(0) + get_group_id(1);
    //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

    {{complex_type}} entry = 0;
    for (int e = 0; e < {{mat_dim}}; ++e) {
        entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] * b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
    }
    res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
}
""")
def get_ctx_queue(devices=(0,)):
    """
    Obtain a context and queue for the specified devices.
    """
    platform = cl.get_platforms()[0]
    platform_devices = platform.get_devices()
    ctx = cl.Context(devices=[platform_devices[x] for x in devices])
    return (ctx, cl.CommandQueue(ctx))
data_types = {
    'cfloat_t': np.complex64,
    'cdouble_t': np.complex128,
    'float': np.float32,
    'double': np.float64
}
def render_kernel(complex_type, real_type, mat_dim):
    # double-precision complex needs the fp64 extension and pyopencl's
    # cdouble definitions
    header = ""
    if data_types[complex_type] == np.complex128:
        header = """
        #pragma OPENCL EXTENSION cl_khr_fp64 : enable
        #define PYOPENCL_DEFINE_CDOUBLE
        """
    templ = KERNEL.render(
        header=header,
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
    )
    print(templ)
    return templ
complex_type = 'cdouble_t'
real_type = 'double'  # kept consistent with cdouble_t (unused by the kernel template)
mat_dim = 25
mats_count = 200 # x*x
ctx, queue = get_ctx_queue()
program = cl.Program(ctx, render_kernel(complex_type, real_type, mat_dim)).build()
mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))
a = cl.array.to_device(queue, mats_1)
b = cl.array.to_device(queue, mats_2)
c = cl.array.to_device(queue, np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type]))
start = time.time()
program.complex_mat_mul(queue, (mats_count*mat_dim, mats_count*mat_dim, 1), (mat_dim, mat_dim, 1), a.data,b.data,c.data)
queue.finish()
result = c.get()
print("opencl time: %.3f" % (time.time()-start))
assert np.allclose(numpy_result.flatten(), result.flatten(), atol=0), "FAIL opencl"
print("Success")
PyCUDA code:
#-*- coding: utf-8 -*-
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from jinja2 import Template
import time
import numpy as np
KERNEL = Template("""
#include <stdio.h>
#include <pycuda-complex.hpp>
typedef pycuda::complex<float> scmplx;
typedef pycuda::complex<double> dcmplx;
__global__ void complex_mat_mul(const {{complex_type}} *a, const {{complex_type}} *b, {{complex_type}} *res)
{
    int row = threadIdx.y;
    int col = threadIdx.x;
    // one thread block per matrix: the grid is mats_count x mats_count
    // blocks, so this linearizes the 2-D block index
    int mat_id = blockIdx.x * gridDim.x + blockIdx.y;
    //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

    {{complex_type}} entry = 0;
    for (int e = 0; e < {{mat_dim}}; ++e) {
        entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] * b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
    }
    res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
}
""")
data_types = {
    'scmplx': np.complex64,
    'dcmplx': np.complex128,
    'float': np.float32,
    'double': np.float64
}
def render_kernel(complex_type, real_type, mat_dim, block, grid):
    # block and grid are accepted for symmetry with the call site;
    # the template itself does not use them
    templ = KERNEL.render(
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
    )
    print(templ)
    return templ
complex_type = 'dcmplx'
real_type = 'double'
mat_dim = 25
mats_count = 200 # x*x
block = (mat_dim, mat_dim, 1)
grid = (mats_count, mats_count)
program = SourceModule(render_kernel(complex_type, real_type, mat_dim, block, grid))
complex_mat_mul = program.get_function("complex_mat_mul")
mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
result = np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))
a = drv.In(mats_1)
b = drv.In(mats_2)
c = drv.Out(result)
start = time.time()
complex_mat_mul(a, b, c, block=block, grid=grid)
print("cuda time: %.3f" % (time.time()-start))
# allclose rather than array_equal: exact equality is too strict for
# floating-point reductions computed in a different order
assert np.allclose(numpy_result.flatten(), result.flatten(), atol=0), "FAIL"
print("Success")
PyOpenCL runs at least twice as fast in both single and double precision, and changing the number of matrices does not change that result.
Both kernels are invoked the same number of times, and I believe the placement of the benchmarking calls is fair.
What am I missing?
Answer (score: 2)
Running more tests gives the following results.
Using this kernel (the OpenCL version is equivalent except for how row, col and mat_id are determined):
int row = threadIdx.y;
int col = threadIdx.x;
int mat_id = blockIdx.x * gridDim.x + blockIdx.y;

// repeat the multiplication to raise the arithmetic intensity
for (int i = 0; i < 10; i++) {
    {{complex_type}} entry = 0;
    for (int e = 0; e < {{mat_dim}}; ++e) {
        entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] * b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
    }
    res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
}
As mentioned in the comments of the discussion, once I include the initial buffer operations in the measurement, I quickly get comparable results for pycuda. And as the extra loop increases the floating-point work, the fixed pycuda overhead becomes negligible and pycuda pulls ahead. So we can expect pycuda to perform better here.
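To check the first point directly, the OpenCL measurement can be widened to cover the same work the pycuda call performs, i.e. including the host-device transfers. A minimal sketch, reusing the variables from the OpenCL listing in the question:
# Hedged sketch: OpenCL timing including transfers, mirroring what the
# pycuda call with drv.In/drv.Out measures
start = time.time()
a = cl.array.to_device(queue, mats_1)
b = cl.array.to_device(queue, mats_2)
c = cl.array.zeros(queue, (mats_count**2, mat_dim, mat_dim),
                   data_types[complex_type])
program.complex_mat_mul(queue, (mats_count*mat_dim, mats_count*mat_dim, 1),
                        (mat_dim, mat_dim, 1), a.data, b.data, c.data)
result = c.get()  # blocks until the kernel and the copy back have finished
print("opencl incl. transfers: %.3f s" % (time.time() - start))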