这个问题有两个部分,但它们密切相关:
Metal是否提供了一种利用共享线程组内存的方法?
例如,在CUDA中,您可以将设备内存中的数据显式加载到共享内存中,如下所示:
__shared__ float example1
Metal会提供这样的功能吗?似乎所有缓冲区访问都是从全局内存加载的,除非在幕后有一些隐藏的魔法。
这可能不是Metal独有的问题,所以任何GPU高手都可能帮得上忙。Apple提供了一个矩阵乘法示例(见原文链接),我把其中的内核粘贴在下面以供参考:
// Size and pitch parameters that MatrixMultiply reads from buffer(3).
typedef struct
{
    ushort m;       // bound checked against the thread's x block origin (gidIn.x)
    ushort k;       // bound checked against the thread's y block origin (gidIn.y)
    ushort n;       // multiplied by qbytes to locate the end of B (loop end marker)
    ushort pbytes;  // byte step applied to the A pointer on every loop iteration
    ushort qbytes;  // byte step applied to the B pointer and between output rows of C
} MetalMatrixDim;
// MatrixMultiply: every thread accumulates and stores one 8x8 block of C.
//
// On each loop step the thread loads 8 consecutive floats from A and 8 from B,
// adds their outer product (8x8 values, held in 16 float4 accumulators) to the
// running sums, then moves the A pointer forward by pbytes bytes and the B
// pointer forward by qbytes bytes. The loop ends once the B pointer reaches
// B + qbytes * n bytes. Finally the 8 accumulated rows are written to C, with
// consecutive rows qbytes bytes apart.
//
// Parameters:
//   A    [[buffer(0)]]  read-only input; advanced by dims.pbytes bytes per step.
//   B    [[buffer(1)]]  read-only input; advanced by dims.qbytes bytes per step.
//   C    [[buffer(2)]]  output; rows are addressed dims.qbytes bytes apart.
//   dims [[buffer(3)]]  sizes and byte pitches (see MetalMatrixDim).
//   gid                 thread position in the grid; scaled by 8 to obtain the
//                       origin of this thread's 8x8 output block.
//
// NOTE(review): because the A pointer is stepped once per reduction step, A
// appears to be expected with the reduction index as its row (i.e. stored
// transposed relative to C's rows) - confirm against the host-side setup.
// NOTE(review): the float4 casts presumably require 16-byte-aligned rows, i.e.
// pbytes/qbytes padded to a multiple of 16 - confirm against the host code.
kernel void MatrixMultiply(const device float* A [[ buffer(0) ]],
const device float* B [[ buffer(1) ]],
device float* C [[ buffer(2) ]],
constant MetalMatrixDim& dims [[ buffer(3) ]],
ushort2 gid [[ thread_position_in_grid ]])
{
// Local copies of the dimension parameters.
ushort m = dims.m;
ushort k = dims.k;
ushort n = dims.n;
ushort pbytes = dims.pbytes;
ushort qbytes = dims.qbytes;
// Origin of this thread's block: grid position times 8 in both directions.
// The result is stored as ushort, so the origin has to fit in 16 bits.
ushort2 gidIn = ushort2(gid.x << 3, gid.y << 3);
// Threads whose block origin lies outside the m x k range do nothing.
// Only the origin is tested, not origin + 7; presumably m and k are padded
// to multiples of 8 by the caller - TODO confirm.
if (gidIn.x >= m || gidIn.y >= k) return;
// a: 8 floats of A starting at element gidIn.x; b: 8 floats of B starting at
// element gidIn.y. Both are read as two float4 values per step.
const device float4* a = (const device float4*)(A + gidIn.x);
const device float4* b = (const device float4*)(B + gidIn.y);
// Move C down by gidIn.x rows (qbytes bytes per row), then right by gidIn.y
// floats, to reach the top-left element of this thread's output block.
C = (device float*)((device char*)C + gidIn.x*qbytes);
device float4* c = (device float4*)(C + gidIn.y);
// Loop end marker: qbytes * n bytes past the start of B.
const device float4* Bend = (const device float4*)((const device char*)B + qbytes*n);
// 16 float4 accumulators = 8 rows x 8 columns. Output row i (0..7) is held
// in the pair s(2i) (columns 0-3) and s(2i+1) (columns 4-7).
float4 s0 = 0.0f, s1 = 0.0f, s2 = 0.0f, s3 = 0.0f;
float4 s4 = 0.0f, s5 = 0.0f, s6 = 0.0f, s7 = 0.0f;
float4 s8 = 0.0f, s9 = 0.0f, s10 = 0.0f, s11 = 0.0f;
float4 s12 = 0.0f, s13 = 0.0f, s14 = 0.0f, s15 = 0.0f;
// NOTE(review): do/while runs the body at least once, so A and B are read
// even when n == 0 - confirm callers never pass n == 0.
do
{
// Load the 8 values of A and the 8 values of B for this step.
float4 aCurr0 = a[0];
float4 aCurr1 = a[1];
float4 bCurr0 = b[0];
float4 bCurr1 = b[1];
// Rows 0-3 (aCurr0.x..w) times columns 0-3 (bCurr0).
s0 += (aCurr0.x * bCurr0);
s2 += (aCurr0.y * bCurr0);
s4 += (aCurr0.z * bCurr0);
s6 += (aCurr0.w * bCurr0);
// Rows 0-3 times columns 4-7 (bCurr1).
s1 += (aCurr0.x * bCurr1);
s3 += (aCurr0.y * bCurr1);
s5 += (aCurr0.z * bCurr1);
s7 += (aCurr0.w * bCurr1);
// Rows 4-7 (aCurr1.x..w) times columns 0-3.
s8 += (aCurr1.x * bCurr0);
s10 += (aCurr1.y * bCurr0);
s12 += (aCurr1.z * bCurr0);
s14 += (aCurr1.w * bCurr0);
// Rows 4-7 times columns 4-7.
s9 += (aCurr1.x * bCurr1);
s11 += (aCurr1.y * bCurr1);
s13 += (aCurr1.z * bCurr1);
s15 += (aCurr1.w * bCurr1);
// Step both input pointers by their byte pitches to the next reduction step.
a = (device float4*)((device char*)a + pbytes);
b = (device float4*)((device char*)b + qbytes);
} while(b < Bend);
// Store the 8 rows of the block; after each row, c moves down by qbytes bytes.
c[0] = s0; c[1] = s1; c = (device float4*)((device char*)c + qbytes);
c[0] = s2; c[1] = s3; c = (device float4*)((device char*)c + qbytes);
c[0] = s4; c[1] = s5; c = (device float4*)((device char*)c + qbytes);
c[0] = s6; c[1] = s7; c = (device float4*)((device char*)c + qbytes);
c[0] = s8; c[1] = s9; c = (device float4*)((device char*)c + qbytes);
c[0] = s10; c[1] = s11; c = (device float4*)((device char*)c + qbytes);
c[0] = s12; c[1] = s13; c = (device float4*)((device char*)c + qbytes);
c[0] = s14; c[1] = s15;
}
问题:此内核中的每个线程计算输出矩阵C的一个8 x 8子块。这样做的原因是什么?为什么不让每个线程只计算C的单个元素?那样可以去掉矩阵尺寸必须是8的倍数的限制,并为较小的矩阵提供更好的并行度。
我认为这种实现一定是出于某种优化的考虑,我猜它与线程同步和内存访问有关——这也是我把它和问题1放在一起提问的原因。有什么想法吗?
答案 0 :(得分:2)
我看不出你这两个问题之间有什么关系。关于问题1:是的,Metal在计算函数(compute function)中提供了共享的线程组内存。只需在变量声明中指定threadgroup地址空间限定符即可。例如:
// The threadgroup address-space qualifier is what places this variable in threadgroup memory.
threadgroup float example1;
您还可以将线程组缓冲区指定为计算函数的输入参数。
// Skeleton only: each "..." stands for elided parameters or code.
// example2 is a threadgroup buffer received as a kernel argument; the 0 in
// [[threadgroup(0)]] presumably matches the atIndex: value passed to
// -setThreadgroupMemoryLength:atIndex: on the host side.
kernel void my_func(...,
threadgroup float *example2 [[threadgroup(0)]],
...)
{
...
}
该缓冲区由设备分配,其大小通过计算命令编码器的 -setThreadgroupMemoryLength:atIndex: 方法设置。