我的代码的瓶颈是求 N 个数组的索引交集,而且这一操作要执行数百万次。单次用 np.intersect1d 计算很简单,但重复数百万次后开销就很可观。
一个例子:
arr1 = [0,1,2,3,4]
arr2 = [0,3,4]
arr3 = [3,4]
交集是 [3,4]
我想利用GPU线程但在实施方面苦苦挣扎...... 欢迎使用CUDA,OpenCL,Numba和/或其他解决方案。
这是python代码:
import functools, datetime
import numpy as np
def run():
    """Build the synthetic dataset and intersect every group of arrays.

    ``create_data`` produces ``grouped_data`` (a list whose elements each hold
    three unique, sorted NumPy arrays) and ``loop_through_intersections``
    reduces every element to the values common to all of its arrays, so the
    output has the same length as the input. Those common values are meant to
    slice another NumPy array that is not part of this script.

    The author's profiling attributes roughly 9% of the runtime to data
    creation and 91% to the intersection loop; the loop is the part the
    question asks to move to the GPU (CUDA, OpenCL, Numba or other).

    Returns:
        list: one array of common values per group, in input order.
    """
    grouped_data = create_data()  # ~9% of runtime
    # ~91% of runtime. The result is returned so callers can actually use it
    # (it was previously computed and then discarded).
    return loop_through_intersections(grouped_data)
def create_data(n_groups=100000, size=2000):
    """Return synthetic ``grouped_data``: a list of ``n_groups`` groups.

    Each group is a list of three NumPy arrays. Every array is a strided
    slice ``base[::step]`` of ``np.arange(size)`` with a random ``step`` drawn
    from 1..8, so it is unique and sorted. The slices are views of one shared
    base array.

    The author's line profiler attributes ~9% of the total runtime to this
    function at the default sizes.

    Args:
        n_groups: number of groups to generate (default 100000).
        size: length of the base index array (default 2000).

    Returns:
        list: ``n_groups`` lists, each holding three 1-D integer arrays.
    """
    base = np.arange(size)
    # Three draws per group, in group order, which keeps the sequence of RNG
    # calls the same as an explicit append loop.
    return [
        [base[::np.random.randint(1, 9)] for _ in range(3)]
        for _ in range(n_groups)
    ]
def loop_through_intersections(grouped_data):
    """Intersect the arrays of every group in ``grouped_data``.

    Each element of ``grouped_data`` is an iterable of NumPy arrays; they are
    folded pairwise with ``intersect1d`` so that only values present in all of
    them remain. The author's line profiler attributes ~91% of the total
    runtime to this function.

    Returns:
        list: one array of common values per input group, in input order.
    """
    return [functools.reduce(intersect1d, arrays) for arrays in grouped_data]
def intersect1d(ar1, ar2, assume_unique=True):
    """Return the sorted values that occur in both ``ar1`` and ``ar2``.

    Adapted from NumPy's ``intersect1d`` (v1.14.0, numpy/lib/arraysetops.py).
    The two inputs are concatenated and sorted; any value that then sits next
    to an equal neighbour must have come from both inputs, provided neither
    input contains duplicates of its own.

    Args:
        ar1: first 1-D array.
        ar2: second 1-D array.
        assume_unique: when True (the default, and the fast path used by this
            script) both inputs are trusted to contain no duplicates. Pass
            False to de-duplicate them first; otherwise a value repeated
            inside a single input is wrongly reported as common.

    Returns:
        numpy.ndarray: sorted, unique common values.
    """
    if not assume_unique:
        ar1 = np.unique(ar1)
        ar2 = np.unique(ar2)
    merged = np.concatenate((ar1, ar2))
    merged.sort()
    # Keep each element that equals its right-hand neighbour.
    return merged[:-1][merged[1:] == merged[:-1]]
####################################################
# Time one full run; the author measured ~6s.
####################################################
st = datetime.datetime.now()
run()
# The single-argument parenthesised form prints the same text under both the
# Python 2 print statement and the Python 3 print function.
print(datetime.datetime.now() - st)
我也愿意改变输入。例如,我可以将list grouped_data列表转换为矩阵。
欢迎所有GPU解决方案。
**
**
第一次更新,我将数据转换为矩阵(vs列表列表),以将数组传递给GPU。
第二次更新,为简单起见,现在样本数据要小得多。
第3次更新,我正在学习CUDA并写了一个简单的内核,但行为是出乎意料的......
我的内核应当为每个输出列分配 1 个线程。例如第一个线程(线程号 0)读取输入矩阵的第 0、1、2 列并求它们的交集:如果这三列在某一行上的值都是 1,就把输出中对应行设为 1,否则不做任何操作。
目前输出是出乎意料的,我不知道为什么。 任何想法??
import numpy as np
import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import driver, compiler, gpuarray, tools
def create_data(rows, iterations):
    """Return a ``(rows, 3 * iterations)`` float32 matrix of 0/1 membership flags.

    Each column is a mask over the indices ``0..rows-1`` that is 1 at every
    ``step``-th index (starting at index 0) for a random ``step`` in 1..3.
    Columns are generated in order, so columns ``3*k .. 3*k+2`` form the
    k-th group of three. NumPy's global RNG is re-seeded with 42 on every
    call, so the output is reproducible.

    The result is returned C-contiguous (row-major). Passing the transposed
    array straight to ``np.array(..., dtype=...)`` keeps the transposed
    (column-major) memory layout, which does not match a kernel that indexes
    the buffer as ``matrix[row][col]``.
    NOTE(review): this matters only if the GPU upload preserves the host
    layout, which ``pycuda.gpuarray.to_gpu`` appears to do - confirm.

    Args:
        rows: number of indices (matrix rows).
        iterations: number of groups of three columns.

    Returns:
        numpy.ndarray: C-contiguous float32 array of zeros and ones.
    """
    np.random.seed(42)
    columns = []
    for _ in range(3 * iterations):
        mask = np.zeros(rows, dtype=bool)
        mask[::np.random.randint(1, 4)] = True
        columns.append(mask)
    return np.ascontiguousarray(np.array(columns).T, dtype=np.float32)
def get_kernel_code(rows, iterations):
    """Return CUDA C source for ``MyKernel``, specialised to the matrix shape.

    The kernel takes the row count, an input declared as
    ``float[rows][3 * iterations]`` and an output declared as
    ``float[rows][iterations]``. Each thread handles one output column: for
    every row it checks input columns ``3*t``, ``3*t+1`` and ``3*t+2`` and
    stores 1 when all three are 1, otherwise 0.

    Two fixes are built in:
    * the thread index is bounded by the number of output columns (it used to
      be compared with the row count, which is unrelated to the column index);
    * both outcomes are written, so the result no longer depends on whatever
      the output buffer held before the launch.

    Args:
        rows: number of matrix rows.
        iterations: number of output columns (groups of three input columns).

    Returns:
        str: kernel source with the array dimensions substituted in.
    """
    kernel_code = """
    __global__ void MyKernel(int rows, float matrix[MATRIX_ROWS][MATRIX_COLS], float output[OUTPUT_ROWS][OUTPUT_COLS])
    {
        const int thread = blockIdx.x * blockDim.x + threadIdx.x;
        // One thread per result column; surplus threads do nothing.
        if (thread < OUTPUT_COLS) {
            const int col1 = thread * 3;
            const int col2 = thread * 3 + 1;
            const int col3 = thread * 3 + 2;
            for (int i = 0; i < rows; i++) {
                const bool all_set = matrix[i][col1] == 1
                                  && matrix[i][col2] == 1
                                  && matrix[i][col3] == 1;
                output[i][thread] = all_set ? 1.0f : 0.0f;
            }
        }
    }
    """
    substitutions = (
        ('MATRIX_ROWS', rows),
        ('MATRIX_COLS', iterations * 3),
        ('OUTPUT_ROWS', rows),
        ('OUTPUT_COLS', iterations),
    )
    for placeholder, value in substitutions:
        kernel_code = kernel_code.replace(placeholder, str(value))
    return kernel_code
def cuda_attempt(rows, iterations):
    """Run ``MyKernel`` on freshly generated data and print a sanity check.

    Steps: build the kernel source, generate the input matrix, copy it to the
    device, launch one thread per output column, copy the result back, then
    print output column 0 next to the value expected from the first three
    input columns.

    Args:
        rows: number of matrix rows.
        iterations: number of output columns (groups of three input columns).

    Returns:
        numpy.ndarray: the ``(rows, iterations)`` float32 result copied back
        from the device.
    """
    kernel_code = get_kernel_code(rows, iterations)

    # create_data seeds NumPy's RNG itself, so no extra seeding is done here.
    # The kernel indexes the buffer as row-major float[rows][cols], so force
    # that dtype and layout on the host copy before uploading.
    # NOTE(review): gpuarray.to_gpu appears to keep the host strides - confirm.
    matrix = np.ascontiguousarray(create_data(rows, iterations), dtype=np.float32)

    # Transfer host (CPU) memory to device (GPU) memory.
    matrix_gpu = gpuarray.to_gpu(matrix)
    # Zero-filled rather than uninitialised, so cells the kernel does not
    # write read back as 0 instead of arbitrary memory.
    output_gpu = gpuarray.zeros((rows, iterations), np.float32)

    # Compile the kernel code.
    mod = compiler.SourceModule(kernel_code)
    intersect = mod.get_function("MyKernel")

    # One block holding one thread per output column. The threads must lie
    # along x because the kernel derives its index from threadIdx.x; with
    # them on y every thread computed index 0.
    number_threads = output_gpu.shape[1]
    number_blocks = 1
    intersect(
        np.int32(rows), matrix_gpu, output_gpu,
        block=(number_threads, 1, 1),
        grid=(number_blocks, 1),
    )

    gpu_output = output_gpu.get()
    # Expected column 0 is derived from the data instead of a literal that
    # was only valid for rows=10: 1 where all three input columns are 1.
    expected = matrix[:, 0:3].prod(axis=1)
    # Single-string print calls behave the same on Python 2 and Python 3.
    print('\n output col0 which is the intersection of first 3 input columns\n{}'.format(gpu_output[:, :1]))
    print('\n should be \n{}'.format(expected))
    print('\n Matrix Input for 1st Grouping of 3 \n{}'.format(matrix_gpu.get()[:, 0:3]))
    return gpu_output
# Small demo run: 10 rows and 2 output columns (i.e. 6 input columns).
cuda_attempt(rows=10, iterations=2)
答案 0 :(得分:1)
以下是一个解决方案。良好的学习经历。
GPU 代码比 Numba 快 5 倍,没有我期望的那么好……块和网格的大小还可以继续优化,但暂时先到这里。
import numpy as np
import datetime
from numba import njit
from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit
def compare(rows, iterations):
    """Run the CPU (Numba) and GPU (PyCUDA) versions and compare their output.

    Creates a random binary matrix ``a_cpu`` of shape
    ``(rows, 3 * iterations)`` representing the dataset. The goal is to take
    three columns at a time and write 1 to the output matrix wherever all
    three are 1. Both implementations are timed, their totals are printed,
    and the two result matrices are compared element by element.

    Args:
        rows: number of matrix rows.
        iterations: number of groups of three columns (output columns).
    """
    np.random.seed(42)
    a_cpu = np.random.randint(0, 2, (rows, iterations * 3)).astype(np.float32)

    st = datetime.datetime.now()
    cpu = np.zeros((rows, iterations), dtype=int)
    iterate_over_matrix(a_cpu, iterations, rows, cpu)
    # Single-string print calls behave the same on Python 2 and Python 3.
    print('\n\t CPU runtime:  {}'.format(datetime.datetime.now() - st))

    st = datetime.datetime.now()
    gpu = cuda_attempt(rows, iterations, a_cpu)
    print('\n\t GPU runtime:  {}'.format(datetime.datetime.now() - st))

    print("cpu.sum(): {:,}".format(int(cpu.sum())))
    # Accumulate in int64: a float32 total cannot represent every integer
    # above 2**24, so large counts could differ from the CPU total by rounding.
    print("gpu.sum(): {:,}".format(int(gpu.sum(dtype=np.int64))))
    # Equal totals do not prove equal matrices; check every element.
    print("outputs identical: {}".format(np.array_equal(cpu, gpu)))
def get_kernel_code(iterations):
    """Return CUDA C source for ``MatrixMulKernel``.

    Despite its name the kernel performs no matrix multiplication: it treats
    ``A`` as a flat array of consecutive triples and stores the product of
    triple ``t`` in ``C[t]``. For 0/1 inputs that product is 1 exactly when
    all three values are 1. The kernel processes ``ROWS * iterations``
    triples; ``iterations`` is compiled in as the constant ``wC``.

    The linear index is computed from a 2-D grid of 1-D blocks. Indices are
    held in 64 bits so that ``3 * thread`` and ``ROWS * wC`` cannot overflow a
    32-bit int on large inputs.

    Args:
        iterations: number of output columns per row.

    Returns:
        str: kernel source with ``iterations`` substituted in.
    """
    kernel_code = """
    __global__ void MatrixMulKernel(int ROWS, float *A, float *C)
    {
        const int wC = %(C_SIZE)s;
        const long long blockId = (long long)blockIdx.y * gridDim.x + blockIdx.x;
        const long long thread = blockId * blockDim.x + threadIdx.x;
        const long long total = (long long)ROWS * wC;
        // The grid may hold more threads than outputs; surplus threads exit.
        if (thread < total) {
            C[thread] = A[3 * thread] * A[3 * thread + 1] * A[3 * thread + 2];
        }
    }
    """
    return kernel_code % {'C_SIZE': iterations}
def cuda_attempt(rows, iterations, a_cpu):
    """Compute the per-triple product of ``a_cpu`` on the GPU with PyCUDA.

    Args:
        rows: number of rows in ``a_cpu``.
        iterations: number of output columns; ``a_cpu`` is expected to have
            ``3 * iterations`` columns.
        a_cpu: host matrix of 0/1 values.

    Returns:
        numpy.ndarray: ``(rows, iterations)`` float32 result copied back from
        the device.

    Raises:
        AssertionError: if the required grid exceeds 65535 blocks per
            dimension.
    """
    # The kernel reads A as a flat row-major float array, so enforce dtype
    # and layout rather than trusting the caller (a no-op when they match).
    a_gpu = gpuarray.to_gpu(np.ascontiguousarray(a_cpu, dtype=np.float32))
    # Left uninitialised: the kernel writes every one of the
    # rows * iterations output cells.
    c_gpu = gpuarray.empty((rows, iterations), np.float32)

    mod = compiler.SourceModule(get_kernel_code(iterations))
    matrixmul = mod.get_function("MatrixMulKernel")

    # 2D grid of 1D blocks.
    needed_threads = rows * iterations
    threads = 1024
    # Ceiling division, with at least one block so the launch stays valid.
    blocks_needed = max(1, -(-needed_threads // threads))
    # Smallest square grid that holds blocks_needed blocks.
    grid_side = int(np.ceil(np.sqrt(blocks_needed)))
    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if grid_side > 65535:
        raise AssertionError("number of blocks exceeds allowed limit in 1 dimension")

    matrixmul(
        np.int32(rows), a_gpu, c_gpu,
        grid=(grid_side, grid_side),
        block=(threads, 1, 1),
    )
    return c_gpu.get()
#===============================================================================
# CPU CALCULATIONS
#===============================================================================
@njit
def iterate_over_matrix(matrix, iterations, rows, bools):
    """Fill ``bools`` column by column from consecutive column triples of ``matrix``.

    For group ``g`` the three columns ``3*g``, ``3*g+1`` and ``3*g+2`` of
    ``matrix`` are handed to ``check_intersection`` together with column
    ``g`` of ``bools``, which is updated in place.
    """
    for group in range(iterations):
        first = 3 * group
        check_intersection(
            bools[:, group],
            matrix[:, first],
            matrix[:, first + 1],
            matrix[:, first + 2],
            rows,
        )
@njit
def check_intersection(index, ar1, ar2, ar3, rows):
    """Set ``index[r]`` to True for each of the first ``rows`` rows where all three inputs equal True.

    ``index`` is modified in place; entries for rows that do not match are
    left untouched.
    """
    for row in range(rows):
        all_set = (
            ar1[row] == ar2[row]
            and ar2[row] == ar3[row]
            and ar3[row] == True
        )
        if all_set:
            index[row] = True
#===============================================================================
# RUN
#===============================================================================
# Full-size benchmark. For a quick smoke test use small sizes instead,
# e.g. 5 rows and 2 iterations.
rows = 2000
iterations = 100000
compare(rows, iterations)