Why is PyTorch slower than PyOpenCL, which in turn is slower than Numba, on the GPU?

Time: 2020-04-25 07:24:15

Tags: pytorch conv-neural-network convolution numba pyopencl

I am working on an FDTD program that uses the discrete Laplace operator, which can be implemented as a convolution operation. From what I have read, the main component of PyTorch is a tensor library optimized to perform operations commonly used in machine learning (such as convolutions). I was interested in comparing it to the other frameworks I have used, so I wrote a test program that applies the discrete Laplacian to a 1D array repeatedly and compares the execution times.
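As a quick sanity check on the math (this snippet is not part of the benchmark), the one-dimensional discrete Laplacian u[i+1] - 2*u[i] + u[i-1] is just a convolution with the stencil [1, -2, 1], which is the kernel used in the test below:

import numpy as np

u = np.random.rand(10)
stencil = np.array([1.0, -2.0, 1.0])  # discrete Laplacian kernel

direct = u[2:] - 2*u[1:-1] + u[:-2]                      # second difference at interior points
via_convolution = np.convolve(u, stencil, mode="valid")  # the same values, written as a convolution

assert np.allclose(direct, via_convolution)

Here is the full test program: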

import torch as tr
import time
from numba import jit, cuda
import numpy as np
import pyopencl as cl
from pyopencl import array


#parameters
number_of_timesteps = 1000
number_of_elements = 10000000


#set up the initial conditions
torch_data = tr.rand((1,1,number_of_elements),dtype=tr.double) #torch convolution needs shape (minibatch,in_channels,iW)
numba_data = np.array([0] + list(torch_data[0][0].numpy()) + [0]) #add padding [0] for convolution. handled automatically in torch.
opencl_data = np.array([0] + list(torch_data[0][0].numpy()) + [0])


#test Torch
device = "cuda"
torch_data_a = torch_data.to(device)
torch_data_b = torch_data.to(device)
kernel = tr.tensor([[[1,-2,1]]],dtype=tr.double,device=device)
with tr.no_grad():
    start_time = time.time()
    for t in range(round(number_of_timesteps/2)): # /2 because each loop is two convolutions
        torch_data_b = torch_data_a + 0.1* tr.nn.functional.conv1d(torch_data_a,kernel,padding=1)
        torch_data_a = torch_data_b + 0.1* tr.nn.functional.conv1d(torch_data_b,kernel,padding=1)
    print("Torch GPU time:",time.time()-start_time)
    torch_data_numpy = np.array([0] + list(torch_data_a[0][0].cpu().numpy()) + [0])

#Numba GPU kernel
@cuda.jit
def numba_conv_cuda(x,x_new):
    gid = cuda.grid(1)
    if 0 < gid < x.size - 1 :  # Check array boundaries
        x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid])

threadsperblock = 100
blockspergrid = (numba_data.size + (threadsperblock - 1)) // threadsperblock
x_a = cuda.to_device(numba_data)
x_b = cuda.to_device(numba_data)
start_time = time.time()
#actually run the kernel
for t in range(round(number_of_timesteps/2)): #again /2 because each loop is two convolutions
    numba_conv_cuda[blockspergrid, threadsperblock](x_a,x_b)
    numba_conv_cuda[blockspergrid, threadsperblock](x_b,x_a)
print("Numba GPU time:",time.time()-start_time)
numba_data = x_a.copy_to_host()


#test OpenCL
context = cl.create_some_context(interactive=False,answers=[0])
queue = cl.CommandQueue(context)
mem_flags = cl.mem_flags
program = cl.Program(context, """
    #pragma OPENCL EXTENSION cl_khr_fp64 : enable //enable double precision calculations
    __kernel void update_psi(__global const double *x, __global double *x_new)
    {
        int gid = get_global_id(0);
        if(0 < gid && gid < x.size - 1){
            x_new[gid] = x[gid] + 0.1*(x[gid+1]+x[gid-1]-2*x[gid]);
        }
    }
    """.replace("x.size",str(opencl_data.size))).build()
x_a_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)
x_b_buf = cl.Buffer(context, mem_flags.READ_WRITE | mem_flags.COPY_HOST_PTR, hostbuf=opencl_data)


#actually run the OpenCL
start_time = time.time()
for t in range(round(number_of_timesteps/2)): #again /2 because each loop is two convolutions
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_a_buf, x_b_buf)
    event.wait()
    event = program.update_psi(queue, [threadsperblock*blockspergrid], [threadsperblock], x_b_buf, x_a_buf)
    event.wait()
print("OpenCL GPU time:",time.time()-start_time)
event = cl.enqueue_copy(queue, opencl_data, x_a_buf)
event.wait()


print("Results are same?",np.allclose(torch_data_numpy,numba_data) and np.allclose(numba_data,opencl_data))

Here are the results of a test on an Nvidia GPU:

Torch GPU time: 13.544365406036377
Numba GPU time: 0.2404193878173828
OpenCL GPU time: 0.9025869369506836
Results are same? True

To my surprise, the results show that a library designed for applying operations such as convolutions is much slower than Numba or PyOpenCL (which is not even optimized, since it does not use any local memory on the GPU). Is this really the case, or am I doing something wrong?
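(One thing I am not sure about: all the timings above use time.time() on the host, and GPU kernel launches can be asynchronous. A minimal sketch of what I believe explicit synchronization around the timed loops would look like is below, using torch.cuda.synchronize() and numba.cuda.synchronize(); I have not re-run the benchmark this way, so treat it as an untested assumption.)

import time
import torch as tr
from numba import cuda

# sketch: force all queued GPU work to finish before reading the host clock,
# so the measured interval covers kernel execution, not just the launches
tr.cuda.synchronize()
start_time = time.time()
# ... run the Torch convolution loop here ...
tr.cuda.synchronize()
print("Torch GPU time (synchronized):", time.time() - start_time)

cuda.synchronize()
start_time = time.time()
# ... launch the Numba kernels here ...
cuda.synchronize()
print("Numba GPU time (synchronized):", time.time() - start_time)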

Also, why is the kernel written in C about 3x slower than the kernel written in Python?
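For the OpenCL part specifically, PyOpenCL can also report per-kernel device-side execution times through event profiling. A sketch of how I understand that API (it reuses context, program, threadsperblock, blockspergrid and the buffers from the program above; the queue must be created with PROFILING_ENABLE, and the timestamps are in nanoseconds):

import pyopencl as cl

# a queue created with profiling enabled attaches device timestamps to its events
profiling_queue = cl.CommandQueue(
    context, properties=cl.command_queue_properties.PROFILING_ENABLE)

event = program.update_psi(profiling_queue, [threadsperblock*blockspergrid],
                           [threadsperblock], x_a_buf, x_b_buf)
event.wait()
# device-side execution time of this single launch, in nanoseconds
print("kernel time (ns):", event.profile.end - event.profile.start)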

0 answers:

There are no answers yet.