Metal 命令缓冲区内部错误:什么是内部错误(IOAF 代码 2067)?

时间:2021-06-07 21:10:51

标签: swift gpu gpgpu metal

尝试运行计算内核会导致以下消息:

Execution of the command buffer was aborted due to an error during execution. Internal Error (IOAF code 2067)

为了获得更具体的信息,我查询了命令编码器的用户信息(userInfo),并设法提取到了更多细节。我按照这个视频(this video)中的说明操作,得到了以下消息:

[Metal Diagnostics] __message__: MTLCommandBuffer execution failed: The commands 
associated with the encoder were affected by an error, which may or may not have been 
caused by the commands themselves, and failed to execute in full __:::__ 
__delegate_identifier__: GPUToolsDiagnostics

由 API 验证和着色器验证触发的断点只会得到记录的堆栈帧,而不是 GPU 回溯。除了上面的消息之外,这些断点没有提供任何新的信息。

我在文档中找不到任何关于上述 IOAF 代码的说明。打印出的附加信息也没有任何帮助。这个内核的分支发散(divergence)相当严重,我推测这可能导致 GPU 花费过长的时间才能完成。这可能就是罪魁祸首,但除了直觉之外,我没有任何依据。

这是线程组的设置:

// Number of threads placed in each threadgroup, taken from the pipeline state.
let threadExecutionWidth = pipeline.threadExecutionWidth

// Ceiling division: enough threadgroups to cover `pixelCount` threads even when
// `pixelCount` is not a multiple of `threadExecutionWidth`.
let threadgroupsPerGrid = MTLSize(width: (Int(pixelCount) + threadExecutionWidth - 1) / threadExecutionWidth, height: 1, depth: 1)

// One-dimensional threadgroup of `threadExecutionWidth` threads.
let threadsPerThreadgroup = MTLSize(width: threadExecutionWidth, height: 1, depth: 1)

// Encode the dispatch of the whole grid.
commandEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)

GPU 命令的提交和等待完成的方式如下:

// No further commands are encoded after this point.
commandEncoder.endEncoding()

// Submit the command buffer, then wait on the calling thread until it completes.
commandBuffer.commit()
commandBuffer.waitUntilCompleted()

这是我的应用程序端代码的完整内容:

import Metal
import Foundation
import simd

typealias Float4 = SIMD4<Float>

/// Minimal text-file writer backed by a `FileHandle`.
///
/// The file is created if it does not exist. Unless `append` is `true`, any
/// existing contents are discarded when the writer is created. Every call to
/// `write(content:)` appends ASCII-encoded text at the end of the file.
struct SimpleFileWriter {
  var fileHandle: FileHandle

  /// Opens (creating if necessary) the file at `filePath` for writing.
  ///
  /// - Parameters:
  ///   - filePath: Path of the file to write to.
  ///   - append: When `false` (the default) the file is truncated to zero
  ///     length; when `true` existing contents are kept.
  ///
  /// Traps with a descriptive message if the file cannot be created or opened.
  init(filePath: String, append: Bool = false) {
    let fileManager = FileManager.default

    if !fileManager.fileExists(atPath: filePath) {
      // `createFile` reports failure through its return value; surface it here
      // instead of failing later with an unexplained nil-unwrap crash.
      guard fileManager.createFile(atPath: filePath, contents: nil, attributes: nil) else {
        fatalError("Could not create file at \(filePath)!")
      }
    }

    guard let handle = FileHandle(forWritingAtPath: filePath) else {
      fatalError("Could not open \(filePath) for writing!")
    }
    fileHandle = handle

    if !append {
      fileHandle.truncateFile(atOffset: 0)
    }
  }

  /// Appends `content` to the end of the file.
  ///
  /// - Parameter content: Text to write; it must be representable as ASCII,
  ///   otherwise the call traps.
  func write(content: String) {
    // Always write at the end, regardless of where the handle currently points.
    fileHandle.seekToEndOfFile()
    guard let data = content.data(using: String.Encoding.ascii) else {
      fatalError("Could not convert \(content) to ascii data!")
    }
    fileHandle.write(data)
  }
}

// MARK: - Render configuration

var imageWidth = 480
var imageHeight = 270
var sampleCount = 16
var bounceCount = 3

// MARK: - Metal setup

// The force-unwraps / `try!` below are deliberate: without a device, the
// bundled library, or the `ray_trace` function this script cannot do anything.
let device = MTLCreateSystemDefaultDevice()!
let library = try! device.makeDefaultLibrary(bundle: Bundle.module)
let primaryRayFunc = library.makeFunction(name: "ray_trace")!
let pipeline = try! device.makeComputePipelineState(function: primaryRayFunc)

// One RGBA float pixel per image location, initialised to zero.
var pixelData: [Float4] = Array(repeating: Float4(repeating: 0), count: imageWidth * imageHeight)
var pixelCount = UInt(pixelData.count)

// GPU buffer seeded with `pixelData`, plus a typed CPU-side view of its contents
// that is used to read the result back after the kernel has run.
let pixelDataBuffer = device.makeBuffer(bytes: &pixelData, length: Int(pixelCount) * MemoryLayout<Float4>.stride, options: [])!
let pixelDataMirrorPointer = pixelDataBuffer.contents().bindMemory(to: Float4.self, capacity: Int(pixelCount))
let pixelDataMirrorBuffer = UnsafeBufferPointer(start: pixelDataMirrorPointer, count: Int(pixelCount))

let commandQueue = device.makeCommandQueue()!
let commandBufferDescriptor = MTLCommandBufferDescriptor()
// Ask Metal to record per-encoder status so a failure can be attributed.
commandBufferDescriptor.errorOptions = MTLCommandBufferErrorOption.encoderExecutionStatus
let commandBuffer = commandQueue.makeCommandBuffer(descriptor: commandBufferDescriptor)!
let commandEncoder = commandBuffer.makeComputeCommandEncoder()!

// MARK: - Kernel arguments

// The kernel declares these parameters as 32-bit `uint` / `int`. Swift's
// `Int` / `UInt` are 8 bytes on 64-bit platforms, so convert explicitly and
// pass exactly the size the shader reads (the previous 8-byte upload only
// worked because the low 4 bytes come first on little-endian hardware).
var kernelPixelCount = UInt32(pixelCount)
var kernelImageWidth = Int32(imageWidth)
var kernelImageHeight = Int32(imageHeight)
var kernelSampleCount = Int32(sampleCount)
var kernelBounceCount = Int32(bounceCount)

commandEncoder.setComputePipelineState(pipeline)
commandEncoder.setBuffer(pixelDataBuffer, offset: 0, index: 0)
commandEncoder.setBytes(&kernelPixelCount, length: MemoryLayout<UInt32>.stride, index: 1)
commandEncoder.setBytes(&kernelImageWidth, length: MemoryLayout<Int32>.stride, index: 2)
commandEncoder.setBytes(&kernelImageHeight, length: MemoryLayout<Int32>.stride, index: 3)
commandEncoder.setBytes(&kernelSampleCount, length: MemoryLayout<Int32>.stride, index: 4)
commandEncoder.setBytes(&kernelBounceCount, length: MemoryLayout<Int32>.stride, index: 5)

// MARK: - Dispatch

// One thread per pixel. Each threadgroup holds `threadExecutionWidth` threads,
// so the number of threadgroups is `pixelCount / threadExecutionWidth`, rounded
// up. The kernel itself discards the surplus threads of the last group.
let threadExecutionWidth = pipeline.threadExecutionWidth
let threadgroupsPerGrid = MTLSize(width: (Int(pixelCount) + threadExecutionWidth - 1) / threadExecutionWidth, height: 1, depth: 1)
// For performance the threadgroup size should be a multiple of
// `threadExecutionWidth`; here it is exactly one times that width.
let threadsPerThreadgroup = MTLSize(width: threadExecutionWidth, height: 1, depth: 1)
commandEncoder.dispatchThreadgroups(threadgroupsPerGrid, threadsPerThreadgroup: threadsPerThreadgroup)
commandEncoder.endEncoding()

commandBuffer.commit()
commandBuffer.waitUntilCompleted()

// MARK: - Diagnostics

if let error = commandBuffer.error as NSError? {
  // Always report the failure itself; the per-encoder details are optional.
  print("Command buffer failed: \(error.localizedDescription)")
  if let encoderInfo = error.userInfo[MTLCommandBufferEncoderInfoErrorKey] as? [MTLCommandBufferEncoderInfo] {
    for info in encoderInfo {
      print(info.label + info.debugSignposts.joined())
    }
  }
}

// MARK: - PPM output

/// Converts a colour channel to an 8-bit PPM sample.
///
/// Values inside `0...1` map exactly as `UInt8(value * 255)` does. NaN and
/// out-of-range values are clamped instead of trapping, which matters when the
/// GPU run failed and the buffer holds unexpected data.
func ppmChannel(_ value: Float) -> UInt8 {
  guard !value.isNaN else { return 0 }
  return UInt8(min(max(value, 0), 1) * 255)
}

// The image is written even if the command buffer reported an error above.
let sfw = SimpleFileWriter(filePath: "/Users/pprovins/Desktop/render.ppm")
sfw.write(content: "P3\n")
sfw.write(content: "\(imageWidth) \(imageHeight)\n")
sfw.write(content: "255\n")

// Build the pixel section in memory and write it once rather than issuing one
// file write per pixel; the resulting file contents are the same.
var pixelText = ""
pixelText.reserveCapacity(pixelDataMirrorBuffer.count * 12)
for pixel in pixelDataMirrorBuffer {
  pixelText += "\(ppmChannel(pixel.x)) \(ppmChannel(pixel.y)) \(ppmChannel(pixel.z)) "
}
sfw.write(content: pixelText)

sfw.write(content: "\n")

此外,这里是正在运行的着色器。为简洁起见,我没有包括所有函数定义:

/// Compute kernel that shades one pixel per thread and stores the result in
/// `result[index]`.
///
/// For every pixel it takes `samplesPerPixel` jittered samples, follows each
/// sample ray for at most `rayBounces` bounces, averages the sample colours and
/// writes the square root of the average with alpha 1.0.
/// NOTE(review): `getSeed`, `rand`, `testForHit`, `sceneDistance`,
/// `random_f3_in_unit_sphere`, `default_atmosphere_color`, `Ray` and
/// `HitRecord` are defined elsewhere; `seed` is passed to the random helpers
/// repeatedly, so the order of those calls is kept exactly as is.
kernel void ray_trace(device float4 *result [[ buffer(0) ]],
                        const device uint& dataLength [[ buffer(1) ]],
                        const device int& imageWidth [[ buffer(2) ]],
                        const device int& imageHeight [[ buffer(3) ]],
                        const device int& samplesPerPixel [[ buffer(4) ]],
                        const device int& rayBounces [[ buffer (5)]],
                        const uint index [[thread_position_in_grid]]) {
  // Threads beyond the end of the output buffer have nothing to do.
  if (index >= dataLength) {
    return;
  }

  // Pixel position derived from the linear thread index.
  const uint pixelColumn = index % imageWidth;
  const uint pixelRow = index / imageWidth;

  // Viewport: 2 units tall, `2 * aspect` units wide, one unit in front of the
  // origin along -z.
  const float aspect = float(imageWidth) / float(imageHeight);
  const float3 origin = float3(0.0);
  const float3 viewportHeight = float3(0.0, 2.0, 0.0);
  const float3 viewportWidth = float3(2.0 * aspect, 0.0, 0.0);
  const float3 lowerLeftCorner = float3(-(viewportHeight / 2.0) - (viewportWidth / 2.0) - float3(0.0, 0.0, 1.0));

  thread float seed = getSeed(index, pixelColumn, pixelRow);

  const float row = float(pixelRow);
  const float col = float(pixelColumn);

  float3 colorSum = float3(0.0);

  for (int sample = 0; sample < samplesPerPixel; ++sample) {
    // Jitter inside the pixel: x offset is drawn first, then y.
    const float jitterX = fract(rand(seed));
    const float jitterY = fract(rand(seed));
    const float u = (col + jitterX) / float(imageWidth - 1);
    const float v = 1.0 - (row + jitterY) / float(imageHeight - 1);
    Ray ray = Ray(origin, lowerLeftCorner + u * viewportWidth + v * viewportHeight - origin);

    // Stays black when every bounce reports a hit.
    float3 sampleColor = float3(0.0);
    HitRecord hit = {0.0, 0.0, false};
    float attenuation = 1.0;

    for (int bounce = 0; bounce < rayBounces; ++bounce) {
      testForHit(sceneDistance, ray, hit);

      // No hit: take the background colour, scaled by the attenuation
      // accumulated so far, and stop following this ray.
      if (!hit.h) {
        sampleColor = default_atmosphere_color(ray) * attenuation;
        break;
      }

      // Hit: halve the attenuation and continue with a new ray from the hit
      // point towards a randomly perturbed target.
      const float3 target = hit.p + hit.n + random_f3_in_unit_sphere(seed);
      attenuation *= 0.5;
      ray = Ray(hit.p, target - hit.p);
    }

    colorSum += sampleColor / samplesPerPixel;
  }

  result[index] = float4(sqrt(colorSum), 1.0);
}

奇怪的是,它偶尔能够成功运行。将样本数改为 16 或以上时,总是会出现上述 IOAF 错误代码。样本数少于 16 时,代码大约有 25% 的概率能成功运行。样本越多,出现该错误代码的可能性就越大。

有没有办法获得关于 IOAF 代码 2067 的更多信息?

1 个答案:

答案 0 :(得分:0)

仅靠 Metal API 验证和着色器验证(Shader Validation)无法确定该错误代码的具体原因。

通过测试内核的各个部分,特定错误被缩小到导致 GPU 挂起的 while 循环。

问题本质上可以归结为如下代码:

// The condition is always true and the body never breaks out, so this never ends.
while(true) {
 // ad infinitum
}

或者,对于上面调用 random_f3_in_unit_sphere(seed) 的代码:

// Repeats for as long as randNum(seed) stays below `threshold`,
// incrementing `seed` on every pass.
while(randNum(seed) < threshold) {
  // Nothing limits the number of iterations: the loop is not "bounded"
  // in any sense. Whoops.
  ++seed;
}