Question

我的代码的瓶颈在于求 N 个数组的索引交集，而且这一步要执行数百万次。单次调用 np.intersect1d 只是很简单的 NumPy 计算，但重复数百万次之后，开销就非常可观。

一个例子：

arr1 = [0,1,2,3,4]
arr2 = [0,3,4]
arr3 = [3,4]

交集是 [3,4]

我想利用 GPU 线程，但在实现上遇到了困难…… 欢迎提供 CUDA、OpenCL、Numba 和/或其他解决方案。

这是python代码：

import functools, datetime
import numpy as np


def run():
    """
    Build the fake dataset and compute the per-entry intersections.

    `grouped_data` (from `create_data`) is a list of 100k entries.
    Each element has 3 numpy arrays that are UNIQUE AND SORTED.

    Goal: iterate through `grouped_data` to find intersecting values per element.
    Ie, length of output equals length of input, `grouped_data`.
    In each element, these common values will be used to slice another numpy
    array which is not included here.

    *Question*: how can this be moved to the GPU?  I'd like to leverage GPU threads.
    CUDA, OpenCL, Numba and/or `other` solutions welcome.

    Returns the list produced by `loop_through_intersections` so callers can
    inspect or verify the result instead of it being silently discarded.
    """
    grouped_data = create_data()                            # 9% of runtime
    return loop_through_intersections(grouped_data)         # 91% of runtime



def create_data(n_groups=100000, n_values=2000):
    """Return `grouped_data`, a list of `n_groups` entries of 3 numpy arrays each.

    Every array is a strided slice ``base[::step]`` of ``0 .. n_values - 1``
    where ``step`` is drawn with ``np.random.randint(1, 9)`` (1..8), so each
    array is sorted and free of duplicates.  The slices are views onto one
    shared base array, not copies.

    n_groups -- number of entries to generate (default 100000, as before).
    n_values -- length of the base range the slices are taken from
                (default 2000, as before).

    kern profiler shows this function takes ~ 9% of runtime (with the defaults)
    """
    base = np.arange(n_values)

    grouped_data = []
    for _ in range(n_groups):
        # Three independent draws per entry, in the same order as before, so a
        # seeded RNG yields the same dataset.
        grouped_data.append([base[::np.random.randint(1, 9)] for _ in range(3)])

    return grouped_data


def loop_through_intersections(grouped_data):
    """Return one intersection per entry of `grouped_data`.

    Each entry is a sequence of numpy arrays (3 in the sample data); the
    arrays of an entry are folded pairwise, left to right, with `intersect1d`.
    The output list has the same length and order as the input.

    kern profiler shows this function takes ~ 91% of runtime
    """
    return [functools.reduce(intersect1d, arrays) for arrays in grouped_data]


def intersect1d(ar1, ar2):
    """
    Intersect two 1-D arrays.

    The inputs are merged and sorted; any value that then sits next to an
    equal neighbour is reported.  The result is therefore the sorted set of
    common values only when each input is itself free of duplicates.

    Taken from NumPy.   https://github.com/numpy/numpy/blob/v1.14.0/numpy/lib/arraysetops.py#L297-L338
    """
    merged = np.sort(np.concatenate((ar1, ar2)))
    # True at position k when merged[k] is followed by an identical value.
    has_twin = merged[1:] == merged[:-1]
    return merged[:-1][has_twin]




####################################################
#            Runtime takes ~6s 
####################################################

# Wall-clock timing of one full run (data creation + intersections).
st = datetime.datetime.now()
run()
# Single-argument print(...) emits the same text on Python 2 and Python 3;
# the bare print statement was a SyntaxError on Python 3.
print(datetime.datetime.now() - st)

我也愿意改变输入的形式。例如，我可以把列表 grouped_data 转换为矩阵。

欢迎所有GPU解决方案。

**更新 - CUDA ATTEMPT**

第一次更新，我将数据转换为矩阵（vs列表列表），以将数组传递给GPU。

第二次更新，为简单起见，现在样本数据要小得多。

第3次更新，我正在学习CUDA并写了一个简单的内核，但行为是出乎意料的......

我的内核应该为每个输出列分配 1 个线程。以第一个线程（索引 0）为例：它读取输入矩阵的第 0、1、2 列并求它们的交集——如果某一行在这三列中的值都是 1，就把输出中对应的行设置为 1，否则不执行任何操作。

目前输出是出乎意料的，我不知道为什么。任何想法??

import numpy as np

import pycuda.driver as drv
import pycuda.autoinit
from pycuda.compiler import SourceModule
from pycuda import driver, compiler, gpuarray, tools


def create_data(rows, iterations):
    """Build the 0/1 input matrix for the kernel.

    For each of the `iterations` groups, three boolean columns of length
    `rows` are generated; a column is True at indices 0, step, 2*step, ...
    with ``step`` drawn from ``np.random.randint(1, 4)`` (1..3).  The RNG is
    seeded with 42 so the data is reproducible.

    Returns a float32 array of shape (rows, iterations * 3) in C (row-major)
    order; columns 3*k .. 3*k+2 belong to group k.
    """
    np.random.seed(42)

    grouped_data = []
    for _ in range(iterations * 3):
        index = np.zeros(rows, dtype=bool)
        index[::np.random.randint(1, 4)] = True
        grouped_data.append(index)

    # The transpose is a Fortran-ordered view and np.array(..., dtype=...)
    # keeps that layout (order='K').  The kernel indexes matrix[i][col] as
    # row-major memory, so force a C-contiguous copy here.
    return np.ascontiguousarray(np.array(grouped_data).T, dtype=np.float32)



def get_kernel_code(rows, iterations):
    """Return the CUDA source of `MyKernel` specialised for one problem size.

    The kernel takes (int rows, matrix, output) where matrix is
    rows x (iterations * 3) and output is rows x iterations, both float and
    indexed row-major.  Thread t handles output column t: for every row it
    stores 1 when input columns 3t, 3t+1 and 3t+2 are all 1, else 0.

    rows       -- number of rows in both matrices.
    iterations -- number of output columns (groups of 3 input columns).
    """
    kernel_code = """
    __global__ void MyKernel(int rows, float matrix[MATRIX_ROWS][MATRIX_COLS], float output[OUTPUT_ROWS][OUTPUT_COLS])
    {
        const int thread = blockIdx.x * blockDim.x + threadIdx.x;

        // One thread per output column: guard on the column count, not on
        // the row count, so surplus threads never write out of bounds and
        // no column is skipped when there are more columns than rows.
        if (thread < OUTPUT_COLS){

            int col1 = thread*3;
            int col2 = thread*3+1;
            int col3 = thread*3+2;

            for (int i=0; i<rows; i++) {
                // Store both outcomes so the result does not depend on how
                // the output buffer was initialised.
                if (matrix[i][col1]==1 && matrix[i][col1]==matrix[i][col2] && matrix[i][col2]==matrix[i][col3]) {
                    output[i][thread] = 1; }
                else {
                    output[i][thread] = 0; }
            }
        }

    }
    """

    # Bake the array extents into the source; the 2-D array parameters need
    # compile-time dimensions.
    substitutions = (
        ('MATRIX_ROWS', rows),
        ('MATRIX_COLS', iterations * 3),
        ('OUTPUT_ROWS', rows),
        ('OUTPUT_COLS', iterations),
    )
    for placeholder, value in substitutions:
        kernel_code = kernel_code.replace(placeholder, str(value))

    return kernel_code



def cuda_attempt(rows, iterations):
    """
    Create data, use gpuarray, get pycuda result.

    Builds the input with `create_data`, runs `MyKernel` with one thread per
    output column, then prints the first output column, the expected values
    for the 10-row sample, and the first three input columns.

    rows       -- number of rows of the input/output matrices.
    iterations -- number of output columns; launched as a single block, so it
                  must not exceed the device's threads-per-block limit.
    """

    # Setup data
    kernel_code = get_kernel_code(rows, iterations)
    np.random.seed(42)
    # The kernel indexes matrix[i][col] as row-major memory, so make sure the
    # host array is C-contiguous float32 before it is copied to the device.
    matrix = np.ascontiguousarray(create_data(rows, iterations), dtype=np.float32)


    # Transfer host (CPU) memory to device (GPU) memory
    matrix_gpu = gpuarray.to_gpu(matrix)
    # Zero-initialise: any cell the kernel does not write must read back as 0
    # rather than as whatever was left in device memory.
    output = gpuarray.zeros((rows, iterations), np.float32)


    # Compile the kernel code 
    mod = compiler.SourceModule(kernel_code)
    intersect = mod.get_function("MyKernel")


    # Define Thread & Block Size
    number_threads = output.shape[1]
    number_blocks = 1

    # The kernel derives its column from threadIdx.x / blockIdx.x, so the
    # threads have to be laid out along x (block = (threads, 1, 1)).
    intersect(
        np.int32(rows), matrix_gpu, output,
        block=(number_threads, 1, 1),
        grid=(number_blocks, 1),
        )

    gpu_output = output.get()
    # Single-argument print(...) works on Python 2 and 3; each label ends in a
    # newline, so the array starts on its own line exactly as before.
    print('\n output col0 which is the intersection of first 3 input columns\n' + str(gpu_output[:, :1]))
    print('\n should be \n' + str(np.array([1, 0,0,1,0,0,1,0,0,1], dtype=float)))


    old = matrix_gpu.get()
    print('\n Matrix Input for 1st Grouping of 3 \n' + str(old[:, 0:3]))



# Run the kernel on a tiny problem (10 rows, 2 output columns) so the printed
# result can be checked by eye.
cuda_attempt(rows=10, iterations=2)

Answer 1

以下是一个解决方案。良好的学习经历。

GPU 代码比 Numba 快 5 倍，没有我期望的那么好…… 块大小和网格大小还有优化空间，但暂时先到此为止。

import numpy as np
import datetime
from numba import njit

from pycuda import driver, compiler, gpuarray, tools
import pycuda.autoinit 



def compare(rows, iterations):
    """
    Run CPU & GPU Version.  Compare output.
    Creates binary matrix called a_cpu which represents a dataset.
    The goal is to take 3 columns at a time and if all are 1, pass 1
    to the output matrix.

    rows       -- number of rows of the random 0/1 matrix.
    iterations -- number of 3-column groups (the matrix has iterations*3 columns).

    Prints the wall-clock time of each version and the sum of each result so
    the two can be compared; returns nothing.
    """

    np.random.seed(42)
    a_cpu = np.random.randint(0,2, (rows, iterations*3)).astype(np.float32)

    # print(...) with a single pre-formatted string gives identical output on
    # Python 2 and Python 3 (the print statement is a SyntaxError on 3).
    st = datetime.datetime.now()
    cpu = np.zeros((rows, iterations), dtype=int)
    iterate_over_matrix(a_cpu, iterations, rows, cpu)
    print('{} {}'.format('\n\t CPU runtime: ', datetime.datetime.now() - st))


    st = datetime.datetime.now()
    gpu = cuda_attempt(rows, iterations, a_cpu)
    print('{} {}'.format('\n\t GPU runtime: ', datetime.datetime.now() - st))

    print("cpu.sum(): {:,}".format(cpu.sum()))
    print("gpu.sum(): {:,}".format(int(gpu.sum())))



def get_kernel_code(iterations):
    """Return the CUDA source of `MatrixMulKernel` for `iterations` output columns.

    The kernel treats A and C as flat float buffers: thread t stores
    A[3t] * A[3t+1] * A[3t+2] into C[t] for every t below ROWS * iterations.
    `iterations` is baked into the source as the constant `wC`.
    """
    template = """
    __global__ void MatrixMulKernel(int ROWS, float *A, float *C)
    {
        const int wC = %(C_SIZE)s;

        const int blockId = blockIdx.y * gridDim.x + blockIdx.x;
        const int thread = blockId * blockDim.x + threadIdx.x;


        if ( thread < (ROWS * wC) ) {
            float Aele = A[3*thread] * A[3*thread +1] * A[3*thread +2];
            C[thread] = Aele;
        }
    }
    """

    # A_SIZE is supplied although the template does not reference it.
    substitutions = dict(A_SIZE=3 * iterations, C_SIZE=iterations)
    return template % substitutions



def cuda_attempt(rows, iterations, a_cpu):
    """
    Run `MatrixMulKernel` over `a_cpu` on the GPU and return the result.

    rows, iterations -- shape of the output matrix (rows x iterations).
    a_cpu            -- host input array handed to gpuarray.to_gpu.

    Returns the (rows, iterations) float32 output copied back to the host.
    """

    device_in = gpuarray.to_gpu(a_cpu)
    device_out = gpuarray.empty((rows, iterations), np.float32)

    module = compiler.SourceModule(get_kernel_code(iterations))
    kernel = module.get_function("MatrixMulKernel")

    # 2D Grid of 1D Blocks: enough 1024-thread blocks for one thread per
    # output element, arranged as a square grid.
    threads_per_block = 1024
    blocks_needed = (rows * iterations) // threads_per_block + 1
    grid_side = int(np.sqrt(blocks_needed)) + 1

    assert (grid_side <= 65535), "number of blocks exceeds allowed limit in 1 dimension"

    kernel(
        np.int32(rows), device_in, device_out,
        grid=(grid_side, grid_side),
        block=(threads_per_block, 1, 1),
        )

    return device_out.get()



#===============================================================================
#                    CPU CALCULATIONS
#===============================================================================

@njit
def iterate_over_matrix(matrix, iterations, rows, bools):
    """Fill column g of `bools` from columns 3g, 3g+1, 3g+2 of `matrix`.

    Delegates the per-row test to `check_intersection`; `bools` is modified
    in place and nothing is returned.
    """
    for group in range(iterations):
        first = group * 3
        check_intersection(
            bools[:, group],
            matrix[:, first],
            matrix[:, first + 1],
            matrix[:, first + 2],
            rows,
        )



@njit
def check_intersection(index, ar1, ar2, ar3, rows):
    """Set index[r] to True for each of the first `rows` rows where ar1, ar2
    and ar3 all compare equal to True; other entries of `index` are left
    untouched.
    """
    for row in range(rows):
        a = ar1[row]
        b = ar2[row]
        c = ar3[row]
        if a == b and b == c and c == True:
            index[row] = True





#===============================================================================
#                    RUN
#===============================================================================


# Tiny size for eyeballing results; overridden by the full-size values below.
rows, iterations = 5, 2


# Full benchmark size.
rows, iterations = 2000, 100000


compare(rows, iterations)

GPU上的NumPy N阵列交集

更新 - CUDA ATTEMPT

1 个答案: