Relative to the pivot (pivo), I am trying to parallelize quicksort by designating two other lists to hold the separated segments. I am running into a syntax problem, and I also need to keep pointers to the ends of the two new lists. How can I get rid of the syntax error, and save the list sizes at the end of the kernel?
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements: list) -> list:
    if len(listElements) <= 1:
        return listElements
    else:
        pivo = listElements.pop()
        list1 = []
        list2 = []
        kernel_code_template = """
        __global__ void separateQuick(int *listElements, int *list1, int *list2, int pivo)
        {
            int index1 = 0, index2 = 0;
            int index = blockIdx.x * blockDim.x + threadIdx.x;
            int stride = blockDim.x * gridDim.x;
            for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
                if (lista[i] < pivo
                {
                    list1[index2] = listElements[i];
                    index1++;
                }
                else
                {
                    list2[index2] = listElements[i];
                    index2++;
                }
        }
        """
        SIZE = len(listElements)
        listElements = np.asarray(listElements)
        listElements = listElements.astype(np.int)
        lista_gpu = cuda.mem_alloc(listElements.nbytes)
        cuda.memcpy_htod(lista_gpu, listElements)
        list1_gpu = cuda.mem_alloc(listElements.nbytes)
        list2_gpu = cuda.mem_alloc(listElements.nbytes)
        BLOCK_SIZE = 256
        NUM_BLOCKS = (SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE
        kernel_code = kernel_code_template % {
            'ARRAY_SIZE': SIZE
        }
        mod = compiler.SourceModule(kernel_code)
        arraysQuick = mod.get_function("separateQuick")
        arraysQuick(lista_gpu, list1_gpu, list2_gpu, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
        list1 = list1_gpu.get()
        list2 = list2_gpu.get()
        np.allclose(list1, list1_gpu.get())
        np.allclose(list2, list2_gpu.get())
        return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
Here is the runtime error:
Traceback (most recent call last):
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/quickSort.py", line 104, in <module>
print(quickSort_paraleloGlobal([1, 5, 4, 2, 0]))
File "C:/Users/mateu/Documents/GitHub/ppc_Sorting_and_Merging/quickSort.py", line 60, in quickSort_paraleloGlobal
mod = compiler.SourceModule(kernel_code)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 291, in __init__
arch, code, cache_dir, include_dirs)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 254, in compile
return compile_plain(source, options, keep, nvcc, cache_dir, target)
File "C:\Users\mateu\Documents\GitHub\ppc_Sorting_and_Merging\venv\lib\site-packages\pycuda\compiler.py", line 137, in compile_plain
stderr=stderr.decode("utf-8", "replace"))
pycuda.driver.CompileError: nvcc compilation of C:\Users\mateu\AppData\Local\Temp\tmpefxgkfkk\kernel.cu failed
[command: nvcc --cubin -arch sm_61 -m64 -Ic:\users\mateu\documents\github\ppc_sorting_and_merging\venv\lib\site-packages\pycuda\cuda kernel.cu]
[stdout:
kernel.cu
]
[stderr:
kernel.cu(10): error: expected a ")"
kernel.cu(19): warning: parsing restarts here after previous syntax error
kernel.cu(19): error: expected a statement
kernel.cu(5): warning: variable "indexMenor" was declared but never referenced
kernel.cu(5): warning: variable "indexMaior" was declared but never referenced
2 errors detected in the compilation of "C:/Users/mateu/AppData/Local/Temp/tmpxft_00004260_00000000-10_kernel.cpp1.ii".
]
Process finished with exit code 1
Answer 0 (score: 1)
Your code has a number of issues; I don't think I can list them all. One of the central problems, however, is that you have attempted a naive conversion of a serial quicksort into a thread-parallel quicksort, and such a simple conversion is not possible.
Numerous changes to the kernel code would be needed to allow threads to work in parallel while dividing the input list between the two separate output lists.
However, we can address most of the other issues by restricting each kernel launch to a single thread.
With that idea, the following code appears to sort the given input correctly:
$ cat t18.py
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements):
    if len(listElements) <= 1:
        return listElements
    else:
        pivo = listElements.pop()
        pivo = np.int32(pivo)
        kernel_code_template = """
        __global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
        {
            int index1 = 0, index2 = 0;
            int index = blockIdx.x * blockDim.x + threadIdx.x;
            int stride = blockDim.x * gridDim.x;
            for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
                if (listElements[i] < pivo)
                {
                    list1[index1] = listElements[i];
                    index1++;
                }
                else
                {
                    list2[index2] = listElements[i];
                    index2++;
                }
            *l1_size = index1;
            *l2_size = index2;
        }
        """
        SIZE = len(listElements)
        listElements = np.asarray(listElements)
        listElements = listElements.astype(np.int32)
        lista_gpu = cuda.mem_alloc(listElements.nbytes)
        cuda.memcpy_htod(lista_gpu, listElements)
        list1_gpu = cuda.mem_alloc(listElements.nbytes)
        list2_gpu = cuda.mem_alloc(listElements.nbytes)
        l1_size = cuda.mem_alloc(4)
        l2_size = cuda.mem_alloc(4)
        BLOCK_SIZE = 1
        NUM_BLOCKS = 1
        kernel_code = kernel_code_template % {
            'ARRAY_SIZE': SIZE
        }
        mod = compiler.SourceModule(kernel_code)
        arraysQuick = mod.get_function("separateQuick")
        arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
        l1_sh = np.zeros(1, dtype=np.int32)
        l2_sh = np.zeros(1, dtype=np.int32)
        cuda.memcpy_dtoh(l1_sh, l1_size)
        cuda.memcpy_dtoh(l2_sh, l2_size)
        list1 = np.zeros(l1_sh, dtype=np.int32)
        list2 = np.zeros(l2_sh, dtype=np.int32)
        cuda.memcpy_dtoh(list1, list1_gpu)
        cuda.memcpy_dtoh(list2, list2_gpu)
        list1 = list1.tolist()
        list2 = list2.tolist()
        return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python t18.py
[0, 1, 2, 4, 5]
$
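Beyond the five-element run above, a quick sanity check is to compare the result against Python's built-in sorted() on random input. This is a minimal sketch (it assumes the quickSort_paralleloGlobal function defined above is in scope):

import random

# build a random test list; pass a copy, since the function pops the pivot
data = [random.randint(0, 100) for _ in range(50)]
assert quickSort_paralleloGlobal(list(data)) == sorted(data)
print("single-thread version OK")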
The next step in the porting process would be to convert your naive serial kernel into one that can run in a thread-parallel fashion. One relatively simple way to do this is to use atomics to manage all output data (both lists, as well as the updates to each list's size).
Here is one possible approach:
$ cat t18.py
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda import gpuarray, compiler
from pycuda.compiler import SourceModule
import time
import numpy as np
def quickSort_paralleloGlobal(listElements):
    if len(listElements) <= 1:
        return listElements
    else:
        pivo = listElements.pop()
        pivo = np.int32(pivo)
        kernel_code_template = """
        __global__ void separateQuick(int *listElements, int *list1, int *list2, int *l1_size, int *l2_size, int pivo)
        {
            int index = blockIdx.x * blockDim.x + threadIdx.x;
            int stride = blockDim.x * gridDim.x;
            for (int i = index; i < %(ARRAY_SIZE)s; i+= stride)
                if (listElements[i] < pivo)
                {
                    list1[atomicAdd(l1_size, 1)] = listElements[i];
                }
                else
                {
                    list2[atomicAdd(l2_size, 1)] = listElements[i];
                }
        }
        """
        SIZE = len(listElements)
        listElements = np.asarray(listElements)
        listElements = listElements.astype(np.int32)
        lista_gpu = cuda.mem_alloc(listElements.nbytes)
        cuda.memcpy_htod(lista_gpu, listElements)
        list1_gpu = cuda.mem_alloc(listElements.nbytes)
        list2_gpu = cuda.mem_alloc(listElements.nbytes)
        l1_size = cuda.mem_alloc(4)
        l2_size = cuda.mem_alloc(4)
        BLOCK_SIZE = 256
        NUM_BLOCKS = (SIZE + BLOCK_SIZE - 1) // BLOCK_SIZE
        kernel_code = kernel_code_template % {
            'ARRAY_SIZE': SIZE
        }
        mod = compiler.SourceModule(kernel_code)
        arraysQuick = mod.get_function("separateQuick")
        l1_sh = np.zeros(1, dtype=np.int32)
        l2_sh = np.zeros(1, dtype=np.int32)
        cuda.memcpy_htod(l1_size, l1_sh)
        cuda.memcpy_htod(l2_size, l2_sh)
        arraysQuick(lista_gpu, list1_gpu, list2_gpu, l1_size, l2_size, pivo, block=(BLOCK_SIZE, 1, 1), grid=(NUM_BLOCKS, 1))
        cuda.memcpy_dtoh(l1_sh, l1_size)
        cuda.memcpy_dtoh(l2_sh, l2_size)
        list1 = np.zeros(l1_sh, dtype=np.int32)
        list2 = np.zeros(l2_sh, dtype=np.int32)
        cuda.memcpy_dtoh(list1, list1_gpu)
        cuda.memcpy_dtoh(list2, list2_gpu)
        list1 = list1.tolist()
        list2 = list2.tolist()
        return quickSort_paralleloGlobal(list1) + [pivo] + quickSort_paralleloGlobal(list2)
print(quickSort_paralleloGlobal([1, 5, 4, 2, 0]))
$ python t18.py
[0, 1, 2, 4, 5]
$
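Note that with the atomic approach, threads race on atomicAdd, so elements land in list1 and list2 in a nondeterministic order; the end result is still sorted, because each partition is subsequently sorted by the recursion. A sketch of a check on an input large enough to span several blocks (again assuming the function above is in scope):

import random

# 2048 elements force NUM_BLOCKS > 1, so the atomics actually contend
data = [random.randint(0, 1000) for _ in range(2048)]
assert quickSort_paralleloGlobal(list(data)) == sorted(data)
print("thread-parallel version OK")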
I'm not suggesting that the above examples are perfect or defect-free, and I'm not identifying every change I made to your code; I suggest you study the differences between these examples and what you posted.
I should also mention that this is not a fast or efficient way to sort numbers on the GPU; I assume this is a learning exercise. If you're interested in fast parallel sorting, you should use a library implementation. If you want to do that from python, cupy provides one possible implementation.
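A minimal sketch of what that could look like, assuming cupy is installed to match your CUDA toolkit:

import cupy as cp

a = cp.asarray([1, 5, 4, 2, 0])   # copy the host data to the GPU
a_sorted = cp.sort(a)             # library-provided device-side sort
print(cp.asnumpy(a_sorted))       # [0 1 2 4 5]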