How do we copy a multidimensional array into a kernel with Alea GPU? How do we work with multidimensional arrays inside a kernel?
Malloc doesn't seem to accept one:
double[,] inputs;
double[,] dInputs1 = Worker.Malloc(inputs); // I get an error here
var dOutputs1 = Worker.Malloc<double>(inputs1.Length);
Worker.Launch(SquareKernel, lp, dOutputs1.Ptr, dInputs1.Ptr, inputs.Length); // dInputs1.Ptr causes an error
[AOTCompile]
static void SquareKernel(deviceptr<double> outputs, deviceptr<double[,]> inputs, int n)
{
var start = blockIdx.x * blockDim.x + threadIdx.x;
var stride = gridDim.x * blockDim.x;
for (var i = start; i < n; i += stride)
{
outputs[i] = inputs[i,0] * inputs[i,0];
}
}
Answer 0 (score: 1)
Alea GPU up to version 2.2 (the latest as of this writing) does not yet support allocating 2D arrays, so you have to flatten the row and column indices yourself inside the kernel. On the host side, you can use a few extension methods that P/Invoke the CUDA driver API (these P/Invoke functions are available from Alea.CUDA.dll) to transfer a pinned .NET array to and from the device.
Here is a quick working example I put together:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using Alea.CUDA;
using Alea.CUDA.IL;
using NUnit.Framework;
namespace ConsoleApplication1
{
static class Extension
{
public static DeviceMemory<T> Malloc<T>(this Worker worker, T[,] array2D)
{
var rows = array2D.GetLength(0);
var cols = array2D.GetLength(1);
var dmem = worker.Malloc<T>(rows*cols);
var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy it to the device with the CUDA
// driver API. Use worker.Eval to make sure the worker's context is
// pushed onto the current thread.
worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyHtoD(devicePtr, hostPtr,
new IntPtr(Intrinsic.__sizeof<T>()*rows*cols)));
});
}
finally
{
handle.Free();
}
return dmem;
}
public static DeviceMemory<T> Malloc<T>(this Worker worker, int rows, int cols)
{
return worker.Malloc<T>(rows*cols);
}
public static void Gather<T>(this DeviceMemory<T> dmem, T[,] array2D)
{
var rows = array2D.GetLength(0);
var cols = array2D.GetLength(1);
var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy the device data back into it with
// the CUDA driver API. Use worker.Eval to make sure the worker's
// context is pushed onto the current thread.
dmem.Worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyDtoH(hostPtr, devicePtr,
new IntPtr(Intrinsic.__sizeof<T>() * rows * cols)));
});
}
finally
{
handle.Free();
}
}
}
class Program
{
static int FlattenIndex(int row, int col, int cols)
{
return row*cols + col;
}
[AOTCompile]
static void Kernel(deviceptr<double> outputs, deviceptr<double> inputs, int rows, int cols)
{
// for simplicity, I do all things in one thread.
for (var row = 0; row < rows; row++)
{
for (var col = 0; col < cols; col++)
{
outputs[FlattenIndex(row, col, cols)] = inputs[FlattenIndex(row, col, cols)];
}
}
}
[Test]
public static void Test()
{
var worker = Worker.Default;
// Keep it small, since the kernel runs in a single GPU thread.
const int rows = 10;
const int cols = 5;
var rng = new Random();
var inputs = new double[rows, cols];
for (var row = 0; row < rows; ++row)
{
for (var col = 0; col < cols; ++col)
{
inputs[row, col] = rng.Next(1, 100);
}
}
var dInputs = worker.Malloc(inputs);
var dOutputs = worker.Malloc<double>(rows, cols);
var lp = new LaunchParam(1, 1);
worker.Launch(Kernel, lp, dOutputs.Ptr, dInputs.Ptr, rows, cols);
var outputs = new double[rows, cols];
dOutputs.Gather(outputs);
Assert.AreEqual(inputs, outputs);
}
public static void Main(string[] args)
{
}
}
}
Answer 1 (score: 0)
That's great; it works perfectly!
Thank you very much for your answer, it is extremely useful!
I had no idea we could call a non-AOTCompile function like FlattenIndex from a kernel!
That is very interesting and useful: it means we can use many more functions on the GPU.
Building on your example, I added a 3-dimensional-array overload to the Extension class (inside my sample, which also runs with 2 GPUs):
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Alea.CUDA;
using Alea.CUDA.Utilities;
using Alea.CUDA.IL;
using NUnit.Framework;
using System.Threading;
using System.Runtime.InteropServices;
namespace WindowsFormsApplication2
{
public partial class Form1 : Form
{
public Form1()
{
InitializeComponent();
}
#region ********************************************** 1 GPU ************************************************************************************
private void button1_Click(object sender, EventArgs e)
{
textBox1.Text = "";
var inputs = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs1 = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs2 = Enumerable.Range(0, 101).Select(i => -15.0 + i * 0.1).ToArray();
var outputs = SquareGPU(inputs, inputs1);
textBox1.Text = "Ok";
}
static double[] SquareGPU(double[] inputs, double[] inputs1)
{
var worker1 = Worker.Get(0);
var dInputs1 = worker1.Malloc(inputs);
var dOutputs1 = worker1.Malloc<double>(inputs1.Length);
const int blockSize = 256;
var numSm = worker1.Device.Attributes.MULTIPROCESSOR_COUNT;
var gridSize = Math.Min(16 * numSm, Common.divup(inputs.Length, blockSize));
var lp = new LaunchParam(gridSize, blockSize);
worker1.Launch(SquareKernel, lp, dOutputs1.Ptr, dInputs1.Ptr, inputs.Length);
return dOutputs1.Gather();
//var worker = Worker.Default;
//using (var dInputs = worker.Malloc(inputs))
//using (var dOutputs = worker.Malloc<double>(inputs.Length))
//{
// const int blockSize = 256;
// var numSm = worker.Device.Attributes.MULTIPROCESSOR_COUNT;
// var gridSize = Math.Min(16 * numSm, Common.divup(inputs.Length, blockSize));
// var lp = new LaunchParam(gridSize, blockSize);
// worker.Launch(SquareKernel, lp, dOutputs.Ptr, dInputs.Ptr, inputs.Length);
// return dOutputs.Gather();
//}
}
#endregion
#region ********************************************** 2 GPU ************************************************************************************
private void button2_Click(object sender, EventArgs e)
{
textBox2.Text = "";
ThreadStart thread0a = new ThreadStart(GPU1);
Thread thread0b = new Thread(thread0a);
thread0b.Start();
ThreadStart thread1a = new ThreadStart(GPU2);
Thread thread1b = new Thread(thread1a);
thread1b.Start();
Boolean j1 = thread0b.Join(10000);
Boolean j2 = thread1b.Join(10000);
if (!j1 || !j2)
{
textBox2.Text = "Error";
}
else
{
textBox2.Text = "Ok";
}
}
public static void GPU1()
{
var inputs = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs1 = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs2 = Enumerable.Range(0, 101).Select(i => -15.0 + i * 0.1).ToArray();
var outputs = SquareGPU2(Worker.Get(0), inputs, inputs1, inputs2);
}
public static void GPU2()
{
var inputs = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs1 = Enumerable.Range(0, 101).Select(i => -5.0 + i * 0.1).ToArray();
var inputs2 = Enumerable.Range(0, 101).Select(i => -15.0 + i * 0.1).ToArray();
var outputs = SquareGPU2(Worker.Get(1), inputs, inputs1, inputs2);
}
static double[] SquareGPU2(Worker Worker, double[] inputs, double[] inputs1, double[] inputs2)
{
var dInputs1 = Worker.Malloc(inputs);
var dOutputs1 = Worker.Malloc<double>(inputs1.Length);
const int blockSize = 256;
var numSm = Worker.Device.Attributes.MULTIPROCESSOR_COUNT;
var gridSize = Math.Min(16 * numSm, Common.divup(inputs.Length, blockSize));
var lp = new LaunchParam(gridSize, blockSize);
Worker.Launch(SquareKernel, lp, dOutputs1.Ptr, dInputs1.Ptr, inputs.Length);
return dOutputs1.Gather();
}
#endregion
#region ********************************************** Array 2 Dimension ************************************************************************
private void button3_Click(object sender, EventArgs e)
{
textBox3.Text = "";
var worker = Worker.Default;
// Keep it small, since the kernel runs in a single GPU thread.
const int rows = 10;
const int cols = 5;
var rng = new Random();
var inputs = new double[rows, cols];
for (var row = 0; row < rows; ++row)
{
for (var col = 0; col < cols; ++col)
{
inputs[row, col] = rng.Next(1, 100);
}
}
var dInputs = worker.Malloc(inputs);
var dOutputs = worker.Malloc<double>(rows, cols);
var lp = new LaunchParam(1, 1);
worker.Launch(Kernel, lp, dOutputs.Ptr, dInputs.Ptr, rows, cols);
var outputs = new double[rows, cols];
dOutputs.Gather(outputs);
Assert.AreEqual(inputs, outputs);
textBox3.Text = "Ok";
}
static int FlattenIndex(int row, int col, int cols)
{
return row * cols + col;
}
#endregion
#region ********************************************** Array 3 Dimension ************************************************************************
private void button4_Click(object sender, EventArgs e)
{
textBox4.Text = "";
var worker = Worker.Default;
// Keep it small, since the kernel runs in a single GPU thread.
const int rows = 10;
const int cols = 5;
const int cols2 = 3;
var rng = new Random();
var inputs = new double[rows, cols, cols2];
for (var row = 0; row < rows; ++row)
{
for (var col = 0; col < cols; ++col)
{
for (var col2 = 0; col2 < cols2; ++col2)
{
inputs[row, col, col2] = rng.Next(1, 100);
}
}
}
var dInputs = worker.Malloc(inputs);
var dOutputs = worker.Malloc<double>(rows, cols, cols2);
var lp = new LaunchParam(1, 1);
worker.Launch(Kernel3D, lp, dOutputs.Ptr, dInputs.Ptr, rows, cols, cols2);
var outputs = new double[rows, cols, cols2];
dOutputs.Gather(outputs);
Assert.AreEqual(inputs, outputs);
textBox4.Text = "Ok";
}
static int FlattenIndex3D(int row, int col,int col2, int cols, int cols2)
{
return (row * cols * cols2) + (col * cols2) + col2;
}
#endregion
#region **************************************************** AOTCompile *************************************************************************
[AOTCompile]
static void SquareKernel(deviceptr<double> outputs, deviceptr<double> inputs, int n)
{
var start = blockIdx.x * blockDim.x + threadIdx.x;
var stride = gridDim.x * blockDim.x;
for (var i = start; i < n; i += stride)
{
outputs[i] = inputs[i] * inputs[i];
}
}
[AOTCompile]
static void Kernel(deviceptr<double> outputs, deviceptr<double> inputs, int rows, int cols)
{
// for simplicity, I do all things in one thread.
for (var row = 0; row < rows; row++)
{
for (var col = 0; col < cols; col++)
{
outputs[FlattenIndex(row, col, cols)] = inputs[FlattenIndex(row, col, cols)];
}
}
}
[AOTCompile]
static void Kernel3D(deviceptr<double> outputs, deviceptr<double> inputs, int rows, int cols, int cols2)
{
// for simplicity, I do all things in one thread.
for (var row = 0; row < rows; row++)
{
for (var col = 0; col < cols; col++)
{
for (var col2 = 0; col2 < cols2; col2++)
{
outputs[FlattenIndex3D(row, col, col2, cols, cols2)] = inputs[FlattenIndex3D(row, col, col2, cols, cols2)];
}
}
}
}
#endregion
}
static class Extension
{
public static DeviceMemory<T> Malloc<T>(this Worker worker, T[,] array2D)
{
var rows = array2D.GetLength(0);
var cols = array2D.GetLength(1);
var dmem = worker.Malloc<T>(rows * cols);
var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy it to the device with the CUDA
// driver API. Use worker.Eval to make sure the worker's context is
// pushed onto the current thread.
worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyHtoD(devicePtr, hostPtr,
new IntPtr(Intrinsic.__sizeof<T>() * rows * cols)));
});
}
finally
{
handle.Free();
}
return dmem;
}
public static DeviceMemory<T> Malloc<T>(this Worker worker, T[,,] array3D)
{
var rows = array3D.GetLength(0);
var cols = array3D.GetLength(1);
var cols2 = array3D.GetLength(2);
var dmem = worker.Malloc<T>(rows * cols * cols2);
var handle = GCHandle.Alloc(array3D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy it to the device with the CUDA
// driver API. Use worker.Eval to make sure the worker's context is
// pushed onto the current thread.
worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyHtoD(devicePtr, hostPtr,
new IntPtr(Intrinsic.__sizeof<T>() * rows * cols * cols2)));
});
}
finally
{
handle.Free();
}
return dmem;
}
public static DeviceMemory<T> Malloc<T>(this Worker worker, int rows, int cols)
{
return worker.Malloc<T>(rows * cols);
}
public static DeviceMemory<T> Malloc<T>(this Worker worker, int rows, int cols, int cols2)
{
return worker.Malloc<T>(rows * cols * cols2);
}
public static void Gather<T>(this DeviceMemory<T> dmem, T[,] array2D)
{
var rows = array2D.GetLength(0);
var cols = array2D.GetLength(1);
var handle = GCHandle.Alloc(array2D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy the device data back into it with
// the CUDA driver API. Use worker.Eval to make sure the worker's
// context is pushed onto the current thread.
dmem.Worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyDtoH(hostPtr, devicePtr,
new IntPtr(Intrinsic.__sizeof<T>() * rows * cols)));
});
}
finally
{
handle.Free();
}
}
public static void Gather<T>(this DeviceMemory<T> dmem, T[,,] array3D)
{
var rows = array3D.GetLength(0);
var cols = array3D.GetLength(1);
var cols2 = array3D.GetLength(2);
var handle = GCHandle.Alloc(array3D, GCHandleType.Pinned);
try
{
var hostPtr = handle.AddrOfPinnedObject();
var devicePtr = dmem.Handle;
// The .NET array is now pinned; copy the device data back into it with
// the CUDA driver API. Use worker.Eval to make sure the worker's
// context is pushed onto the current thread.
dmem.Worker.EvalAction(() =>
{
CUDAInterop.cuSafeCall(CUDAInterop.cuMemcpyDtoH(hostPtr, devicePtr,
new IntPtr(Intrinsic.__sizeof<T>() * rows * cols * cols2)));
});
}
finally
{
handle.Free();
}
}
}
}
翔, thank you so much for your help; I would never have found a solution like that on my own! This is fantastic!
Kind regards,
灵光