我有以下代码,我认为分配2个16kb内存的数组,用7填充一个,用4填充另一个。
然后我运行代码,发现只有前511个项目加在一起。我80%确定它与线程组/线程数的大小有关。任何想法?func test()
{
var (device, commandQueue, defaultLibrary, commandBuffer, computeCommandEncoder) = initMetal()
let alignment:Int = 0x4000
var xpointer: UnsafeMutableRawPointer? = nil
var ypointer: UnsafeMutableRawPointer? = nil
let numberOfFloats = 4096 //56
let numberOfBytes: Int = 16384
let retx = posix_memalign(&xpointer, alignment, numberOfBytes)
if retx != noErr {
let err = String(validatingUTF8: strerror(retx)) ?? "unknown error"
fatalError("Unable to allocate aligned memory: \(err).")
}
let rety = posix_memalign(&ypointer, alignment, numberOfBytes)
if rety != noErr {
let err = String(validatingUTF8: strerror(rety)) ?? "unknown error"
fatalError("Unable to allocate aligned memory: \(err).")
}
let datax = xpointer!.bindMemory(to: Float.self, capacity: numberOfFloats)
for index in 0..<numberOfFloats {
datax[index] = 7.0
}
let datay = ypointer!.bindMemory(to: Float.self, capacity: numberOfFloats)
for index in 0..<numberOfFloats {
datay[index] = 4.0
}
kernelFunction = defaultLibrary.makeFunction(name: "sigmoid")
do
{
pipelineState = try device.makeComputePipelineState(function: kernelFunction!)
}
catch
{
fatalError("Unable to create pipeline state")
}
let startTime = CFAbsoluteTimeGetCurrent()
computeCommandEncoder.setComputePipelineState(pipelineState)
var xvectorBufferNoCopy = device.makeBuffer(bytesNoCopy: xpointer!, length: numberOfBytes, options: [], deallocator: nil)
computeCommandEncoder.setBuffer(xvectorBufferNoCopy, offset: 0, at: 0)
var yvectorBufferNoCopy = device.makeBuffer(bytesNoCopy: ypointer!, length: numberOfBytes, options: [], deallocator: nil)
computeCommandEncoder.setBuffer(yvectorBufferNoCopy, offset: 0, at: 1)
var threadgroupCounts = MTLSize(width:32,height:1,depth:1)
var threadgroups = MTLSize(width:1024, height:1, depth:1)
computeCommandEncoder.dispatchThreadgroups(threadgroups, threadsPerThreadgroup: threadgroupCounts)
computeCommandEncoder.endEncoding()
commandBuffer.commit()
commandBuffer.waitUntilCompleted()
let timeElapsed = CFAbsoluteTimeGetCurrent() - startTime
print("Time elapsed for \(title): \(timeElapsed) s")
let data = ypointer!.bindMemory(to: Float.self, capacity: numberOfFloats)
for index in 0..<numberOfFloats {
print("\(index) - \(data[index])")
}
}
kernel void addTest(const device float *inVector [[ buffer(0) ]],
device float *outVector [[ buffer(1) ]],
uint id [[ thread_position_in_grid ]]) {
outVector[id] = inVector[id] + outVector[id];
}