Question

您好，

我试图在独立显卡（dGPU）上让数据传输（通过 DMA 引擎）与内核执行相互重叠。

在 OpenCL 中，我通常会创建两个命令队列：一个用于提交数据传输命令，另一个用于启动内核。我在 Metal 中尝试了相同的方法，但是即使这些命令之间没有依赖关系，传输和执行也总是被串行化。至少这是我在 Instruments 的 Metal System Trace 时间线上看到的情况。

这是我的主机端（host）代码：

/// Number of independent upload -> compute -> readback streams issued by main().
enum { kStreamCount = 2 };

/// Per-stream event values. Each stage signals its value once its command
/// buffer has finished, and the next stage waits for that value.
static const uint64_t kUploadDone   = 1;
static const uint64_t kComputeDone  = 2;
static const uint64_t kReadbackDone = 3;

/// Encodes and commits a full-buffer blit copy on `queue`.
///
/// @param queue        Queue the command buffer is created on.
/// @param label        Debug label assigned to the command buffer.
/// @param source       Buffer to copy from (offset 0).
/// @param destination  Buffer to copy to (offset 0).
/// @param length       Number of bytes to copy.
/// @param event        Event used to order this copy against other queues.
/// @param waitValue    Event value to wait for before the copy runs; 0 means
///                     the copy has no upstream dependency and no wait is encoded.
/// @param signalValue  Event value signaled after the copy.
/// @return The committed command buffer.
///
/// NOTE(review): encodeWaitForEvent/encodeSignalEvent require macOS 10.14+ —
/// confirm against the project's deployment target.
static id<MTLCommandBuffer> CommitBufferCopy(id<MTLCommandQueue> queue,
                                             NSString *label,
                                             id<MTLBuffer> source,
                                             id<MTLBuffer> destination,
                                             NSUInteger length,
                                             id<MTLEvent> event,
                                             uint64_t waitValue,
                                             uint64_t signalValue)
{
    id<MTLCommandBuffer> commandBuffer = [queue commandBuffer];
    commandBuffer.label = label;

    // Event commands must be encoded while no encoder is open on the command buffer.
    if (waitValue > 0)
    {
        [commandBuffer encodeWaitForEvent:event value:waitValue];
    }

    id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer blitCommandEncoder];
    [blitEncoder copyFromBuffer:source
                   sourceOffset:0
                       toBuffer:destination
              destinationOffset:0
                           size:length];
    [blitEncoder endEncoding];

    [commandBuffer encodeSignalEvent:event value:signalValue];
    [commandBuffer commit];
    return commandBuffer;
}

/// Encodes and commits one dispatch of `pipeline` over `elementCount` elements.
///
/// Binds `input` at index 0, `output` at index 1 and the element count (as a
/// 32-bit unsigned value) at index 2. The dispatch is rounded up to whole
/// threadgroups, so the kernel is expected to bounds-check its thread index.
///
/// @param queue         Queue the command buffer is created on.
/// @param label         Debug label assigned to the command buffer.
/// @param pipeline      Compute pipeline to run.
/// @param input         Buffer bound at index 0.
/// @param output        Buffer bound at index 1.
/// @param elementCount  Number of elements to process.
/// @param event         Event used to order this dispatch against other queues.
/// @param waitValue     Event value to wait for before the dispatch runs.
/// @param signalValue   Event value signaled after the dispatch.
/// @return The committed command buffer.
static id<MTLCommandBuffer> CommitKernelDispatch(id<MTLCommandQueue> queue,
                                                 NSString *label,
                                                 id<MTLComputePipelineState> pipeline,
                                                 id<MTLBuffer> input,
                                                 id<MTLBuffer> output,
                                                 uint32_t elementCount,
                                                 id<MTLEvent> event,
                                                 uint64_t waitValue,
                                                 uint64_t signalValue)
{
    id<MTLCommandBuffer> commandBuffer = [queue commandBuffer];
    commandBuffer.label = label;

    [commandBuffer encodeWaitForEvent:event value:waitValue];

    // Never request more threads per threadgroup than the pipeline supports.
    const NSUInteger threadsPerGroup = MIN((NSUInteger)256, pipeline.maxTotalThreadsPerThreadgroup);
    const NSUInteger groupCount = ((NSUInteger)elementCount + threadsPerGroup - 1) / threadsPerGroup;

    id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
    [computeEncoder setComputePipelineState:pipeline];
    [computeEncoder setBuffer:input offset:0 atIndex:0];
    [computeEncoder setBuffer:output offset:0 atIndex:1];
    [computeEncoder setBytes:&elementCount length:sizeof(elementCount) atIndex:2];
    [computeEncoder dispatchThreadgroups:MTLSizeMake(groupCount, 1, 1)
                   threadsPerThreadgroup:MTLSizeMake(threadsPerGroup, 1, 1)];
    [computeEncoder endEncoding];

    [commandBuffer encodeSignalEvent:event value:signalValue];
    [commandBuffer commit];
    return commandBuffer;
}

/// Uploads two host buffers to private device buffers, runs `testKernel` on
/// each, reads the results back, and verifies that every output byte equals
/// (index + 1) % 256.
///
/// Uploads, kernel dispatches and readbacks are submitted to three separate
/// command queues. Work on different queues is not ordered implicitly, so each
/// stream carries its own MTLEvent: compute waits for the upload of the same
/// stream and readback waits for its compute. The two streams do not depend on
/// each other, which leaves the driver free to overlap them
/// (NOTE(review): whether it actually does is hardware/driver dependent).
///
/// @return 0 on completion, -1 if setup or GPU execution failed.
int main(int argc, const char * argv[])
{
    const unsigned int bufferLength = 100*1024*1024;

    @autoreleasepool {
        NSArray<id<MTLDevice>> *devices = MTLCopyAllDevices();

        for (id<MTLDevice> candidate in devices)
        {
            // Never pass arbitrary text as the format string itself.
            NSLog(@"%@", candidate.description);
        }

        // firstObject is nil (instead of throwing) when no device is present.
        id<MTLDevice> device = devices.firstObject;
        if (!device)
        {
            NSLog(@"No Metal device available");
            return -1;
        }

        id<MTLLibrary> library = [device newDefaultLibrary];
        id<MTLFunction> kernel = [library newFunctionWithName:@"testKernel"];
        if (!kernel)
        {
            NSLog(@"Failed to load function testKernel from the default library");
            return -1;
        }

        NSError *error = nil;
        id<MTLComputePipelineState> computePipelineState =
            [device newComputePipelineStateWithFunction:kernel error:&error];
        if (!computePipelineState)
        {
            NSLog(@"Failed to create compute pipeline state, error %@", error);
            return -1;
        }

        // One queue per stage so that transfers and kernels can be scheduled independently.
        id<MTLCommandQueue> writeQueue = [device newCommandQueue];
        id<MTLCommandQueue> readQueue  = [device newCommandQueue];
        id<MTLCommandQueue> execQueue  = [device newCommandQueue];

        id<MTLBuffer> hostInputBuffers[kStreamCount];
        id<MTLBuffer> hostOutputBuffers[kStreamCount];
        id<MTLBuffer> deviceInputBuffers[kStreamCount];
        id<MTLBuffer> deviceOutputBuffers[kStreamCount];
        id<MTLEvent>  streamEvents[kStreamCount];

        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            hostInputBuffers[stream]    = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
            hostOutputBuffers[stream]   = [device newBufferWithLength:bufferLength options:MTLResourceStorageModeShared];
            deviceInputBuffers[stream]  = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];
            deviceOutputBuffers[stream] = [device newBufferWithLength:bufferLength options:MTLResourceStorageModePrivate];
            streamEvents[stream]        = [device newEvent];

            if (!hostInputBuffers[stream] || !hostOutputBuffers[stream] ||
                !deviceInputBuffers[stream] || !deviceOutputBuffers[stream] ||
                !streamEvents[stream])
            {
                NSLog(@"Failed to allocate Metal resources for stream %u", stream);
                return -1;
            }

            // Fill the host input with the repeating pattern 0, 1, ..., 255.
            unsigned char *hostInput = (unsigned char *)hostInputBuffers[stream].contents;
            for (unsigned int i = 0; i < bufferLength; i++)
            {
                hostInput[i] = (unsigned char)(i % 256);
            }
        }

        //
        // Copy H to D
        //
        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            CommitBufferCopy(writeQueue,
                             [NSString stringWithFormat:@"HtoDcommandBuffer %u", stream],
                             hostInputBuffers[stream],
                             deviceInputBuffers[stream],
                             bufferLength,
                             streamEvents[stream],
                             0,
                             kUploadDone);
        }

        //
        // Execute kernel
        //
        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            CommitKernelDispatch(execQueue,
                                 [NSString stringWithFormat:@"computeCommandBuffer %u", stream],
                                 computePipelineState,
                                 deviceInputBuffers[stream],
                                 deviceOutputBuffers[stream],
                                 bufferLength,
                                 streamEvents[stream],
                                 kUploadDone,
                                 kComputeDone);
        }

        //
        // Copy D to H
        //
        id<MTLCommandBuffer> readbackCommandBuffers[kStreamCount];
        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            readbackCommandBuffers[stream] =
                CommitBufferCopy(readQueue,
                                 [NSString stringWithFormat:@"DtoHcommandBuffer %u", stream],
                                 deviceOutputBuffers[stream],
                                 hostOutputBuffers[stream],
                                 bufferLength,
                                 streamEvents[stream],
                                 kComputeDone,
                                 kReadbackDone);
        }

        // Wait for every readback and surface GPU-side failures before touching the results.
        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            [readbackCommandBuffers[stream] waitUntilCompleted];
            if (readbackCommandBuffers[stream].status == MTLCommandBufferStatusError)
            {
                NSLog(@"Readback for stream %u failed, error %@", stream, readbackCommandBuffers[stream].error);
                return -1;
            }
        }

        const unsigned char *hostOutputs[kStreamCount];
        for (unsigned int stream = 0; stream < kStreamCount; stream++)
        {
            hostOutputs[stream] = (const unsigned char *)hostOutputBuffers[stream].contents;
        }

        // The kernel adds one (mod 256) to every input byte.
        for (unsigned int i = 0; i < bufferLength; i++)
        {
            const unsigned int expected = (i + 1) % 256;
            for (unsigned int stream = 0; stream < kStreamCount; stream++)
            {
                if (hostOutputs[stream][i] != expected)
                    NSLog(@"Mismatch buffer%u! idx: %u, expected: %u, actial: %d\n", stream, i, expected, hostOutputs[stream][i]);
            }
        }
    }
    return 0;
}

内核：

#include <metal_stdlib>
using namespace metal;

/// Writes (inBuf[gid] + 1) modulo 256 into outBuf[gid] for every thread whose
/// grid position lies inside the buffer.
///
/// inBuf  - source bytes, bound at buffer index 0
/// outBuf - destination bytes, bound at buffer index 1
/// size   - number of valid elements, bound at buffer index 2
/// gid    - this thread's position in the grid
kernel void
testKernel(device unsigned char *inBuf   [[buffer(0)]],
           device unsigned char *outBuf  [[buffer(1)]],
           constant uint &size           [[buffer(2)]],
           uint gid                      [[thread_position_in_grid]])
{
    // Threads positioned at or beyond `size` have no element to process.
    if (gid < size)
    {
        // Widen before adding, then keep the low 8 bits: same result as "% 256".
        const uint incremented = uint(inBuf[gid]) + 1u;
        outBuf[gid] = static_cast<unsigned char>(incremented & 0xFFu);
    }
}

在 Metal 中重叠计算与数据传输

0 个答案: