我有两个数组 a 和 b，它们的类型都是 double[][]。
我想在内核(kernel)中逐一比较这两个数组的子数组(它们的长度相同)。
目前，在调用 use data = this.GPUWorker.MallocArray(data) 时我收到了错误。
抛出异常:Alea.CUDA.dll 中的 'System.Exception'。附加信息:主机数组零拷贝绑定是非公开功能。
我看不出自己对 MallocArray 函数的用法错在哪里?
/// Percentage change from `a` to `b`, expressed relative to `a`:
/// ((b - a) / a) * 100.
/// There is no guard against `a` being zero here; `change` only calls this
/// after checking that `a > 0.0`.
let inline (?+) a b =
    let delta = b - a
    (delta / a) * 100.0
/// Compares the totals of two rows.
///
/// Returns the total of `a` when both totals are strictly positive and the
/// total of `b` is at least 5% larger than the total of `a`; otherwise
/// returns 0.0.
///
/// `a` and `b` are arrays of numbers; an empty array has a total of zero and
/// therefore yields 0.0.
let inline change a b =
    // Array.sum returns zero for an empty array, whereas Array.reduce (+)
    // raises ArgumentException. An empty row is treated as "no change".
    let totalA = Array.sum a
    let totalB = Array.sum b
    // The positivity checks short-circuit before (?+) divides by totalA.
    if totalA > 0.0 && totalB > 0.0 && (?+) totalA totalB >= 5.0 then totalA else 0.0
/// GPU module that applies a binary row operation `op` to the i-th elements
/// of two inputs and stores one result per index.
///
/// NOTE(review): the error report accompanying this code states that calling
/// `MallocArray` with a jagged `'T[][]` throws `System.Exception` from
/// Alea.CUDA.dll, so `Apply(data, pattern)` fails at its first allocation.
type GPU<'T>(target, op : Expr<'T[] -> 'T[] -> 'T>) =
    inherit ILGPUModule(target)

    /// Alternate constructor: accepts a .NET delegate and wraps it in a
    /// quotation that invokes it.
    new(target, op : Func<'T[], 'T[], 'T>) =
        new GPU<'T>(target, <@ fun x y -> op.Invoke(x, y) @>)

    /// Kernel. Each thread starts at its global index and advances by the
    /// total number of launched threads until it reaches `n`, writing
    /// `op input.[i] input2.[i]` into `output.[i]`.
    [<Kernel;ReflectedDefinition>]
    member this.Kernel (n : int) (input : deviceptr<'T[]>) (input2 : deviceptr<'T[]>) (output : deviceptr<'T>) =
        let first = blockIdx.x * blockDim.x + threadIdx.x
        let step = gridDim.x * blockDim.x
        let mutable idx = first
        // TODO this is the actual logic.
        while idx < n do
            let lhs = input.[idx]
            let rhs = input2.[idx]
            output.[idx] <- __eval(op) lhs rhs
            idx <- idx + step

    /// Launches the kernel over `n` elements using blocks of 256 threads.
    /// The grid size is the smaller of (16 * multiprocessor count) and the
    /// number of blocks needed to cover `n`.
    member this.Apply(n : int, input : deviceptr<'T[]>, input2 : deviceptr<'T[]>, output : deviceptr<'T>) =
        let blockSize = 256
        let smCount = this.GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT
        let gridSize = min (16 * smCount) (divup n blockSize)
        let launch = LaunchParam(gridSize, blockSize)
        this.GPULaunch <@ this.Kernel @> launch n input input2 output

    /// Takes in generic arrays to be used by the GPU.
    // May need modification to support other input parameters.
    member this.Apply(data : 'T[][], pattern : 'T[][]) =
        // Allocate GPU memory for both data sets.
        use dData = this.GPUWorker.MallocArray(data)
        use dPattern = this.GPUWorker.MallocArray(pattern)
        // One output slot per element of the allocated data buffer.
        use dOutput = this.GPUWorker.Malloc(dData.Length)
        // Execute the GPU computation.
        this.Apply(dData.Length, dData.Ptr, dPattern.Ptr, dOutput.Ptr)
        // Copy the results from GPU memory back to CPU memory.
        dOutput.Gather()
/// Concrete module specialised to `double`, using `change` as the row
/// operation passed to `GPU<double>`.
[<AOTCompile>]
type GPUModule(target) =
    inherit GPU<double>(target, fun xs ys -> change xs ys)

    // Created lazily on first access, bound to the default worker target.
    static let instance = lazy (new GPUModule(GPUModuleTarget.DefaultWorker))

    /// Shared instance built from `GPUModuleTarget.DefaultWorker`.
    static member DefaultInstance = instance.Force()
答案 0 :(得分:1)
在版本 2 中不能使用锯齿数组(jagged array),因为锯齿数组本身不是 blittable 类型。我们会在下一个版本中支持它。
我认为目前您有两种选择:
第一种:如果锯齿数组的各维尺寸已知,可以把它转换成一维线性数组,并自行做索引计算。
第二种:分别分配各个内部数组,再把它们的设备指针填入外部数组,例如:
代码:
// Allocate one device buffer per inner (row) array of the jagged host array.
let innerDMems = jaggedHostArray |> Array.map (fun row -> worker.Malloc row)
try
    // The outer device buffer holds the device pointers of the inner buffers.
    use outerDMem = worker.Malloc(innerDMems |> Array.map (fun dmem -> dmem.Ptr))
    ....
    //launch kernel with outerDMem.Ptr which is deviceptr<deviceptr<T>>
    ....
finally
    // Release the inner buffers even if the code above throws; they are not
    // bound with `use`, so they would otherwise leak.
    innerDMems |> Array.iter (fun dmem -> dmem.Dispose())
这样内核参数的签名就是 deviceptr<deviceptr<T>>,相当于 C 语言中的 T**。