Question

  # Host-side output buffer, zero-filled, with the same shape/dtype as self.redarray.
  # NOTE(review): the output is shaped from self.redarray, but the device inputs
  # below are sized and filled from self.redcont/greencont/bluecont -- the
  # relationship between the *array and *cont attributes is not shown here; confirm
  # they hold the same number of elements.
  answer_array = np.zeros_like(self.redarray)
        # Allocate one device buffer per array, sized in bytes from the host arrays.
        answer_array_gpu = cuda.mem_alloc(answer_array.nbytes)
        redarray_gpu = cuda.mem_alloc(self.redcont.nbytes)
        greenarray_gpu = cuda.mem_alloc(self.greencont.nbytes)
        bluearray_gpu = cuda.mem_alloc(self.bluecont.nbytes)
        # Copy the three input channels and the zeroed output buffer host -> device.
        cuda.memcpy_htod(redarray_gpu, self.redcont)
        cuda.memcpy_htod(greenarray_gpu, self.greencont)
        cuda.memcpy_htod(bluearray_gpu, self.bluecont)
        cuda.memcpy_htod(answer_array_gpu, answer_array)

        # Kernel: each thread derives one index from the x block/thread coordinates and
        # writes the mean of a, b and c at that index into d.
        # NOTE(review): the kernel takes no length argument and performs no bounds
        # check, so any thread whose index is past the end of the buffers reads and
        # writes out of range.
        desaturate_mod = SourceModule("""
            __global__ void array_desaturation(float *a, float *b, float *c, float *d){
                int index = blockIdx.x * blockDim.x + threadIdx.x;
                d[index] = ((a[index] + b[index] + c[index])/3);
            }
        """)

        func = desaturate_mod.get_function("array_desaturation")
        # NOTE(review): no grid= argument is passed, and self.blocks_to_use is placed
        # in the third (z) slot of block=. Presumably the block count was meant to go
        # in grid=; as written the kernel only uses the x dimension for its index.
        # Confirm against the PyCUDA launch documentation.
        func(redarray_gpu, greenarray_gpu, bluearray_gpu, answer_array_gpu,
             block=(self.gpu_threads, self.gpu_threads, self.blocks_to_use))
        # Copy the device result back into a fresh host array shaped like self.redarray.
        desaturated = np.empty_like(self.redarray)
        cuda.memcpy_dtoh(desaturated, answer_array_gpu)
        print(desaturated)
        print("Up to here")

我编写了这段代码，用于计算三个数组的平均值，并将结果保存到第四个数组中。但这段代码既不打印结果，也不打印“Up to here”这一行。可能是哪里出错了？

其他信息：redarray、greenarray 和 bluearray 都是 float32 类型的 NumPy 数组。

Answer 1

我知道刚开始使用 C 数组（尤其是在 PyCUDA 中）可能很棘手；我自己花了几个月的时间才让一个 2D 滑动最大值算法正常工作。

在此示例中，您无法像在 Python 中那样（只需提供索引）访问数组元素，因为您传入的是指向每个数组第一个元素内存地址的指针。关于这在 C 中如何工作，可以在这里找到一个有用的示例。您还必须传入数组的长度（假设它们长度相同），这样才不会越界；如果它们的长度不同，则需要分别传入每个数组的长度。

希望通过该链接，您可以了解如何在 C 中通过指针访问数组元素。另外，@talonmies 在这里提供了一个很好的示例，介绍了如何传递 2D 数组（这与 1D 数组相同，因为 2D 数组在 GPU 内存中会被展平为 1D 数组）。不过，我在使用时从未像 @talonmies 那样传递步幅（stride），因为正如 TutorialsPoint 教程所说，*(pointer_to_array + index) 就是正确的写法。在这里提供内存步幅会导致越界。

因此，我的代码看起来更像：

# CUDA C source for the `array_desaturation` kernel, kept as a string so it can be
# compiled at runtime. Each thread computes a single flat index from the x
# block/thread coordinates only (y/z launch dimensions are not used), returns early
# when that index is >= arrayLengths, and otherwise stores the mean of the three
# input arrays at that index into outputArray.
C_Code = """
            __global__ void array_desaturation(float *array_A, float *array_B, float *array_C, float *outputArray, int arrayLengths){
                int index = blockIdx.x * blockDim.x + threadIdx.x;
                if(index >= arrayLengths){ // In case our threads created would be outwise out of the bounds for our memory, if we did we would have some serious unpredictable problems
                    return;
                }

                // These variables will get the correct values from the arrays at the appropriate index relative to their unique memory addresses (You could leave this part out but I like the legibility)
                float aValue = *(array_A + index);
                float bValue = *(array_B + index);
                float cValue = *(array_C + index);

                *(outputArray + index) = ((aValue + bValue + cValue)/3); //Set the (output arrays's pointer + offset)'s value to our average value
                }"""


# Compile the CUDA source held in C_Code and look up the kernel entry point by name.
desaturate_mod = SourceModule(C_Code)
desaturate_kernel = desaturate_mod.get_function("array_desaturation")

# Launch the kernel. The cuda.In / cuda.Out wrappers hand host arrays to PyCUDA,
# which performs the host<->device copies around the call (see the PyCUDA driver
# docs), so no explicit mem_alloc / memcpy is needed here.
#
# The element count is taken from `.size` rather than `len()`: for a 1D array the
# two are equal, but for a 2D array `len()` is only the first-axis length, which
# would make the kernel's bounds check stop after the first `len` elements of the
# flattened buffer.
#
# The kernel derives its index from the x dimension only, so
# blockSize[0] * gridSize[0] must be at least array_A.size to cover every element;
# threads that differ only in y recompute the same index. Change the kernel's index
# calculation if you want the y dimension to contribute.
desaturate_kernel(cuda.In(array_A),                    # Input
                  cuda.In(array_B),                    # Input
                  cuda.In(array_C),                    # Input
                  cuda.Out(outputArray),               # Output
                  numpy.int32(array_A.size),           # Total element count (all arrays assumed equal in size)
                  block=(blockSize[0],blockSize[1],1), # Choose the next two parameters freely, but keep the kernel index in sync
                  grid=(gridSize[0],gridSize[1],1)
                  )

print(outputArray) # All arrays above must already be defined before this point

PyCuda程序继续运行

1 个答案: