大家好,
我试图在 dGPU 上让数据传输(通过 DMA 引擎)与内核执行相互重叠。
在 OpenCL 中,我通常会创建两个命令队列:一个用于提交数据传输命令,另一个用于启动内核。我在 Metal 中尝试了相同的方法,但是即使这些传输和执行之间没有依赖关系,它们也总是被串行化。至少这是我在 Instruments 的 Metal System Trace 时间轴上看到的情况。
这是我的主机端(host)代码:
/// Number of threads per threadgroup used when dispatching the test kernel.
static const NSUInteger kOverlapTestThreadsPerThreadgroup = 256;

/// Event values marking how far one stream (upload -> kernel -> download) has progressed.
/// A freshly created MTLEvent starts at 0, so 0 doubles as "do not wait / do not signal".
static const uint64_t kOverlapTestStageNone = 0;
static const uint64_t kOverlapTestStageUploaded = 1;
static const uint64_t kOverlapTestStageComputed = 2;

/// Encodes and commits a blit copy of `length` bytes from `source` to `destination` on `queue`.
///
/// @param waitValue   If non-zero, the copy does not start until `event` reaches this value.
/// @param signalValue If non-zero, `event` is set to this value once the copy has finished.
/// @return The committed command buffer (nil if the queue could not provide one).
static id<MTLCommandBuffer> OverlapTestCommitCopy(id<MTLCommandQueue> queue,
                                                  NSString *label,
                                                  id<MTLBuffer> source,
                                                  id<MTLBuffer> destination,
                                                  NSUInteger length,
                                                  id<MTLEvent> event,
                                                  uint64_t waitValue,
                                                  uint64_t signalValue)
{
    id<MTLCommandBuffer> commandBuffer = [queue commandBuffer];
    commandBuffer.label = label;

    // Waits must be encoded while no encoder is open; they gate every pass encoded afterwards.
    if (waitValue != kOverlapTestStageNone) {
        [commandBuffer encodeWaitForEvent:event value:waitValue];
    }

    id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer blitCommandEncoder];
    [blitEncoder copyFromBuffer:source
                   sourceOffset:0
                       toBuffer:destination
              destinationOffset:0
                           size:length];
    [blitEncoder endEncoding];

    if (signalValue != kOverlapTestStageNone) {
        [commandBuffer encodeSignalEvent:event value:signalValue];
    }

    [commandBuffer commit];
    return commandBuffer;
}

/// Encodes and commits one dispatch of `pipeline` over `elementCount` bytes on `queue`.
///
/// The kernel reads `input` (buffer index 0), writes `output` (index 1) and receives
/// `elementCount` by value (index 2). The dispatch waits for `event` to reach
/// kOverlapTestStageUploaded and raises it to kOverlapTestStageComputed when done.
/// @return The committed command buffer (nil if the queue could not provide one).
static id<MTLCommandBuffer> OverlapTestCommitKernel(id<MTLCommandQueue> queue,
                                                    NSString *label,
                                                    id<MTLComputePipelineState> pipeline,
                                                    id<MTLBuffer> input,
                                                    id<MTLBuffer> output,
                                                    unsigned int elementCount,
                                                    id<MTLEvent> event)
{
    id<MTLCommandBuffer> commandBuffer = [queue commandBuffer];
    commandBuffer.label = label;

    [commandBuffer encodeWaitForEvent:event value:kOverlapTestStageUploaded];

    id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
    [computeEncoder setComputePipelineState:pipeline];
    [computeEncoder setBuffer:input offset:0 atIndex:0];
    [computeEncoder setBuffer:output offset:0 atIndex:1];
    [computeEncoder setBytes:&elementCount length:sizeof(elementCount) atIndex:2];
    // Round the threadgroup count up so a trailing partial group is still covered.
    NSUInteger threadgroupCount =
        (elementCount + kOverlapTestThreadsPerThreadgroup - 1) / kOverlapTestThreadsPerThreadgroup;
    [computeEncoder dispatchThreadgroups:MTLSizeMake(threadgroupCount, 1, 1)
                   threadsPerThreadgroup:MTLSizeMake(kOverlapTestThreadsPerThreadgroup, 1, 1)];
    [computeEncoder endEncoding];

    [commandBuffer encodeSignalEvent:event value:kOverlapTestStageComputed];

    [commandBuffer commit];
    return commandBuffer;
}

/// Runs two independent upload -> kernel -> download streams on three command queues
/// (one for uploads, one for kernels, one for downloads) and verifies the results.
///
/// Within a stream the three stages are ordered with an MTLEvent, because the stages live
/// on different queues. The two streams use separate events and share no buffers, so work
/// from one stream is free to overlap work from the other.
///
/// NOTE: MTLEvent requires macOS 10.14 or later.
///
/// @return 0 when the run completed, -1 when setup or GPU execution failed.
int main(int argc, const char * argv[])
{
    const unsigned int bufferLength = 100*1024*1024;

    @autoreleasepool {
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();
        for (id<MTLDevice> candidate in devices) {
            // Never pass a runtime string as the format: a '%' in it would be interpreted.
            NSLog(@"%@", candidate.description);
        }

        id<MTLDevice> device = devices.firstObject;
        if (!device) {
            NSLog(@"No Metal device available");
            return -1;
        }

        id<MTLLibrary> library = [device newDefaultLibrary];
        id<MTLFunction> kernel = [library newFunctionWithName:@"testKernel"];
        if (!kernel) {
            // Also reached when the default library itself is missing (messaging nil yields nil).
            NSLog(@"Failed to load kernel function testKernel from the default library");
            return -1;
        }

        NSError *error = nil;
        id<MTLComputePipelineState> computePipelineState =
            [device newComputePipelineStateWithFunction:kernel error:&error];
        if (!computePipelineState) {
            NSLog(@"Failed to create compute pipeline state, error %@", error);
            return -1;
        }

        id<MTLCommandQueue> writeQueue = [device newCommandQueue];
        id<MTLCommandQueue> readQueue = [device newCommandQueue];
        id<MTLCommandQueue> execQueue = [device newCommandQueue];

        // Shared buffers are CPU-visible staging areas; private buffers are GPU-only.
        id<MTLBuffer> hostInputBuffer0 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
        id<MTLBuffer> hostInputBuffer1 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
        id<MTLBuffer> hostOutputBuffer0 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
        id<MTLBuffer> hostOutputBuffer1 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
        id<MTLBuffer> deviceInputBuffer0 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];
        id<MTLBuffer> deviceInputBuffer1 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];
        id<MTLBuffer> deviceOutputBuffer0 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];
        id<MTLBuffer> deviceOutputBuffer1 = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];

        // One event per stream; its value records the last finished stage of that stream.
        id<MTLEvent> streamEvent0 = [device newEvent];
        id<MTLEvent> streamEvent1 = [device newEvent];

        if (!writeQueue || !readQueue || !execQueue ||
            !hostInputBuffer0 || !hostInputBuffer1 || !hostOutputBuffer0 || !hostOutputBuffer1 ||
            !deviceInputBuffer0 || !deviceInputBuffer1 || !deviceOutputBuffer0 || !deviceOutputBuffer1 ||
            !streamEvent0 || !streamEvent1) {
            NSLog(@"Failed to allocate Metal queues, buffers or events");
            return -1;
        }

        unsigned char *pHostInputData0 = [hostInputBuffer0 contents];
        unsigned char *pHostInputData1 = [hostInputBuffer1 contents];
        for (unsigned int i = 0; i < bufferLength; i++) {
            pHostInputData0[i] = pHostInputData1[i] = i % 256;
        }

        //
        // Copy H to D
        //
        id<MTLCommandBuffer> upload0 = OverlapTestCommitCopy(writeQueue, @"HtoDcommandBuffer 0",
                                                             hostInputBuffer0, deviceInputBuffer0, bufferLength,
                                                             streamEvent0, kOverlapTestStageNone, kOverlapTestStageUploaded);
        id<MTLCommandBuffer> upload1 = OverlapTestCommitCopy(writeQueue, @"HtoDcommandBuffer 1",
                                                             hostInputBuffer1, deviceInputBuffer1, bufferLength,
                                                             streamEvent1, kOverlapTestStageNone, kOverlapTestStageUploaded);

        //
        // Execute kernel
        //
        id<MTLCommandBuffer> compute0 = OverlapTestCommitKernel(execQueue, @"computeCommandBuffer 0",
                                                                computePipelineState,
                                                                deviceInputBuffer0, deviceOutputBuffer0,
                                                                bufferLength, streamEvent0);
        id<MTLCommandBuffer> compute1 = OverlapTestCommitKernel(execQueue, @"computeCommandBuffer 1",
                                                                computePipelineState,
                                                                deviceInputBuffer1, deviceOutputBuffer1,
                                                                bufferLength, streamEvent1);

        //
        // Copy D to H
        //
        id<MTLCommandBuffer> download0 = OverlapTestCommitCopy(readQueue, @"DtoHcommandBuffer 0",
                                                               deviceOutputBuffer0, hostOutputBuffer0, bufferLength,
                                                               streamEvent0, kOverlapTestStageComputed, kOverlapTestStageNone);
        id<MTLCommandBuffer> download1 = OverlapTestCommitCopy(readQueue, @"DtoHcommandBuffer 1",
                                                               deviceOutputBuffer1, hostOutputBuffer1, bufferLength,
                                                               streamEvent1, kOverlapTestStageComputed, kOverlapTestStageNone);

        // Wait for everything and surface GPU-side failures before touching the results.
        id<MTLCommandBuffer> submitted[] = { upload0, upload1, compute0, compute1, download0, download1 };
        const size_t submittedCount = sizeof(submitted) / sizeof(submitted[0]);
        for (size_t index = 0; index < submittedCount; index++) {
            id<MTLCommandBuffer> commandBuffer = submitted[index];
            [commandBuffer waitUntilCompleted];
            // A nil command buffer reports status 0 (not enqueued) and is treated as a failure too.
            if (commandBuffer.status != MTLCommandBufferStatusCompleted) {
                NSLog(@"Command buffer %zu (%@) did not complete, error %@",
                      index, commandBuffer.label, commandBuffer.error);
                return -1;
            }
        }

        unsigned char *pHostOutputData0 = [hostOutputBuffer0 contents];
        unsigned char *pHostOutputData1 = [hostOutputBuffer1 contents];
        for (unsigned int i = 0; i < bufferLength; i++) {
            // The kernel adds one to every input byte, wrapping at 256.
            const int expected = (int)((i + 1) % 256);
            if (pHostOutputData0[i] != expected)
                NSLog(@"Mismatch buffer0! idx: %d, expected: %d, actial: %d\n", (int)i, expected, pHostOutputData0[i]);
            if (pHostOutputData1[i] != expected)
                NSLog(@"Mismatch buffer1! idx: %d, expected: %d, actial: %d\n", (int)i, expected, pHostOutputData1[i]);
        }
    }
    return 0;
}
内核:
#include <metal_stdlib>
using namespace metal;
/// Writes each input byte plus one (wrapping at 256) to the matching output position.
///
/// inBuf  - source bytes, bound at buffer index 0.
/// outBuf - destination bytes, bound at buffer index 1.
/// size   - number of elements to process, bound at buffer index 2.
/// gid    - this thread's position in the grid; one thread handles one element.
kernel void
testKernel(device unsigned char *inBuf [[buffer(0)]],
           device unsigned char *outBuf [[buffer(1)]],
           constant uint &size [[buffer(2)]],
           uint gid [[thread_position_in_grid]])
{
    // Threads whose index is at or beyond `size` do nothing.
    if (gid < size)
    {
        // The addition is done in int, so 255 + 1 becomes 256 before the modulo wraps it to 0.
        const int incremented = inBuf[gid] + 1;
        outBuf[gid] = incremented % 256;
    }
}