Question

这个问题有两个部分，但它们密切相关：

问题1

Metal是否提供了一种利用共享线程组内存的方法？

例如，在CUDA中，您可以将设备内存中的数据显式加载到共享内存中，如下所示：

__shared__ float example1

Metal会提供这样的功能吗？似乎所有缓冲区访问都是从全局内存加载的，除非在幕后有一些隐藏的魔法。

问题2

这可能不是Metal独有的，所以任何GPU大师都可能有所帮助。 Apple提供了一个矩阵乘法示例here - 我将粘贴下面的内核以供参考：

typedef struct
{
    ushort m, k, n, pbytes, qbytes;
} MetalMatrixDim;


kernel void MatrixMultiply(const device float*       A    [[ buffer(0) ]],
                           const device float*       B    [[ buffer(1) ]],
                           device float*             C    [[ buffer(2) ]],
                           constant MetalMatrixDim&  dims [[ buffer(3) ]],
                           ushort2                   gid  [[ thread_position_in_grid ]])
{
    ushort m = dims.m;
    ushort k = dims.k;
    ushort n = dims.n;

    ushort pbytes = dims.pbytes;
    ushort qbytes = dims.qbytes;

    ushort2 gidIn = ushort2(gid.x << 3, gid.y << 3);

    if (gidIn.x >= m || gidIn.y >= k) return;

    const device float4* a = (const device float4*)(A + gidIn.x);
    const device float4* b = (const device float4*)(B + gidIn.y);

    C = (device float*)((device char*)C + gidIn.x*qbytes);

    device float4* c = (device float4*)(C + gidIn.y);

    const device float4* Bend = (const device float4*)((const device char*)B + qbytes*n);

    float4 s0  = 0.0f, s1  = 0.0f, s2  = 0.0f, s3  = 0.0f;
    float4 s4  = 0.0f, s5  = 0.0f, s6  = 0.0f, s7  = 0.0f;
    float4 s8  = 0.0f, s9  = 0.0f, s10 = 0.0f, s11 = 0.0f;
    float4 s12 = 0.0f, s13 = 0.0f, s14 = 0.0f, s15 = 0.0f;

    do
    {
        float4 aCurr0 = a[0];
        float4 aCurr1 = a[1];
        float4 bCurr0 = b[0];
        float4 bCurr1 = b[1];

        s0   += (aCurr0.x * bCurr0);
        s2   += (aCurr0.y * bCurr0);
        s4   += (aCurr0.z * bCurr0);
        s6   += (aCurr0.w * bCurr0);

        s1   += (aCurr0.x * bCurr1);
        s3   += (aCurr0.y * bCurr1);
        s5   += (aCurr0.z * bCurr1);
        s7   += (aCurr0.w * bCurr1);

        s8   += (aCurr1.x * bCurr0);
        s10  += (aCurr1.y * bCurr0);
        s12  += (aCurr1.z * bCurr0);
        s14  += (aCurr1.w * bCurr0);

        s9   += (aCurr1.x * bCurr1);
        s11  += (aCurr1.y * bCurr1);
        s13  += (aCurr1.z * bCurr1);
        s15  += (aCurr1.w * bCurr1);

        a = (device float4*)((device char*)a + pbytes);
        b = (device float4*)((device char*)b + qbytes);

    } while(b < Bend);

    c[0] = s0;  c[1] = s1;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s2;  c[1] = s3;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s4;  c[1] = s5;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s6;  c[1] = s7;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s8;  c[1] = s9;  c = (device float4*)((device char*)c + qbytes);
    c[0] = s10; c[1] = s11; c = (device float4*)((device char*)c + qbytes);
    c[0] = s12; c[1] = s13; c = (device float4*)((device char*)c + qbytes);
    c[0] = s14; c[1] = s15;
}

问题：对于每个线程，此内核计算输出C的8 x 8扇区。这是什么原因？为什么不允许每个线程计算C的单个元素，这将删除多重的8个大小限制并为较小的矩阵提供更好的并行化？

我认为这个实现必须以某种方式进行优化，而且我猜测它与线程同步和内存访问有关 - 这就是我将它与问题1捆绑在一起的原因。任何想法？

Answer 1

我没有看到你们两个问题之间的关系。关于问题1：是的，Metal在计算功能中提供共享线程组内存。只需在变量声明中指定threadgroup地址空间限定符即可。例如：

threadgroup float example1;

您还可以将线程组缓冲区指定为计算函数的输入参数。

kernel void my_func(...,
                    threadgroup float *example2 [[threadgroup(0)]],
                    ...)
{
    ...
}

缓冲区由设备分配。缓冲区的大小使用计算命令编码器的-setThreadgroupMemoryLength:atIndex:方法设置。

金属 - 优化内存访问

问题1

问题2

1 个答案: