Question

在我目前正在开发的应用程序中，我必须有效地对相当大的向量数组求和。这是我的代码：

// Input maps blended by Mix; each one is indexed there as [x, y, component] with components 0..2.
public List<double[, ,]> normalMaps;

/// <summary>
/// Blends every entry of <c>normalMaps</c> into a single map. For each (x, y) cell it forms the
/// weighted sum of the three components across all maps, multiplies the sum by
/// <paramref name="gain"/>, clamps each component to [-1, 1] and finally divides the triple by
/// its Euclidean length.
/// </summary>
/// <param name="weights">Weight applied to the map at the same index in <c>normalMaps</c>.</param>
/// <param name="gain">Factor applied to every summed component before clamping.</param>
/// <returns>A new <c>double[w, h, 3]</c> array sized from the first two dimensions of the first map.</returns>
public double[, ,] Mix(double[] weights, double gain)
{
    // Output size is taken from the first map only.
    // NOTE(review): assumes every map shares these dimensions and that weights has at least
    // normalMaps.Count entries - confirm with callers.
    int w, h;
    w = normalMaps[0].GetLength(0);
    h = normalMaps[0].GetLength(1);

    double[, ,] ret = new double[w, h, 3];
    int normcount = normalMaps.Count;

    // Parallel replacement for the sequential loop: for (int y = 0; y < h; y++)
    // Each y value is processed by one iteration, which writes only ret[*, y, *].
    Parallel.For(0, h, y =>            
    {
        // x is the FIRST array index but the inner loop here, so consecutive iterations touch
        // elements h * 3 apart in the row-major rectangular array.
        for (int x = 0; x < w; x++)
        {
            // Accumulate the weighted components of every map into the (zero-initialised) cell.
            for (int z = 0; z < normcount; z++) 
            {
                ret[x, y, 0] += normalMaps[z][x, y, 0] * weights[z];
                ret[x, y, 1] += normalMaps[z][x, y, 1] * weights[z];
                ret[x, y, 2] += normalMaps[z][x, y, 2] * weights[z];
            }
            // Apply the overall gain.
            ret[x, y, 0] *= gain;
            ret[x, y, 1] *= gain;
            ret[x, y, 2] *= gain;

            // Clamp each component to [-1, 1].
            ret[x, y, 0] = Math.Max(-1, Math.Min(1, ret[x, y, 0]));
            ret[x, y, 1] = Math.Max(-1, Math.Min(1, ret[x, y, 1]));
            ret[x, y, 2] = Math.Max(-1, Math.Min(1, ret[x, y, 2]));

            // Normalise to unit length. If all three clamped components are 0, retnorm is 0
            // and the divisions below produce NaN.
            double retnorm = Math.Sqrt(ret[x, y, 0] * ret[x, y, 0] + ret[x, y, 1] * ret[x, y, 1] + ret[x, y, 2] * ret[x, y, 2]);
            ret[x, y, 0] /= retnorm;
            ret[x, y, 1] /= retnorm;
            ret[x, y, 2] /= retnorm;

        }
    });

    return ret;
}

现在，当我尝试对7个1024 * 1024的3分量向量数组求和时，这个操作在我的笔记本电脑上需要320毫秒。将代码多线程化已经给我带来了巨大的性能提升，但我需要让它更快。我怎样才能进一步优化它？我已经看到可以用一个简单的数组代替List<>，这会让代码更快，但提升不会太多。真的没有其他可以优化的地方了吗？我正在考虑把这部分工作转移到GPU上，但这还只是一个想法。有人可以帮帮我吗？提前谢谢。

Answer 1

您正在以低效的顺序迭代各个维度，这会导致伪共享（false sharing）；明白这一点并加以修正后，您的代码耗时将从270毫秒降到0毫秒。您基本上是在对“宽度”而不是“高度”做并行化。您可能混淆了数组在内存中的存储方式。

伪共享不是唯一的问题：由于计算机的工作方式，您当前的迭代方式对缓存很不友好。

通常，数组定义应为myArray[HEIGHT, WIDTH]以使其与内存存储保持一致，并且在迭代时，height应该位于最外层。

// Parallelise over x, the first array index, and keep y as the inner loop, so each
// worker walks a contiguous ret[x, *, *] slice of the row-major array.
Parallel.For(0, w, x =>
{
    for (int y = 0; y < h; y++)
    {
        // ... per-cell work from the original loop body goes here ...
    }
});

在相同的尺寸下，仅仅交换了几处顺序，就让我的耗时从800毫秒降到了150毫秒。

Answer 2

正如您所提到的，把List<>换成数组将会显著提升性能。

如果切换到数组，还可以使用指针来迭代这些值。为了固定（pin）数组使其不被GC移动，您会付出很小的性能代价，但考虑到数据规模，利应该大于弊。您可以在.NET框架的源代码中看到大量这样的做法，用来从繁重的迭代中榨取每一滴性能。

您可以使用新的SIMD支持进行实际计算，但我对该主题不够了解，无法提供更多详细信息。我还要提一下，.NET中的新SIMD功能还没有完全完成，仍处于测试阶段。

Answer 3

我敢打赌，如果交换X和Y循环，速度可以提高一倍：

/// <summary>
/// Sketch of Mix with the two spatial loops swapped: the parallel loop runs over x (the first
/// array index) and the inner loop over y. The per-cell body is elided with dots in this excerpt.
/// </summary>
/// <param name="weights">Per-map weights (used by the elided body).</param>
/// <param name="gain">Gain factor (used by the elided body).</param>
/// <returns>The <c>double[w, h, 3]</c> result array.</returns>
public double[, ,] Mix(double[] weights, double gain)
{
    int w, h;
    w = normalMaps[0].GetLength(0);
    h = normalMaps[0].GetLength(1);

    double[, ,] ret = new double[w, h, 3];
    int normcount = normalMaps.Count;

    // Outer (parallel) loop is now over x; y moves to the inner loop.
    Parallel.For(0, w, x =>            
    {
        for (int y = 0; y < h; y++)
        {
           .
           .
           .
        }
    });

    return ret;
}

您希望最内层循环位于最后一个数组索引上，最外层循环位于第一个数组索引上。这样的访问方式对缓存最友好。编译器也不必在每次数组索引查找时都做乘法运算，它只需要一个索引。（如果有帮助的话，我想我可以解释得更清楚......）

编辑：我还有2个其他优化，可以再获得15％的提升。一个是对Z做同样的改变：要做到这一点，需要把Z循环提到主循环之外。这意味着要遍历数据两次，但仍然值得。另一个是消除对normalMaps[z]查找3次所引起的额外查找。请确认结果是一样的：我认为可以分成单独的步骤执行，但也许我漏掉了什么。注意：每一张图都会累加到同一个ret元素上，所以如果让多个线程同时处理不同的Z，这些“+=”更新会互相竞争；Z循环本身应保持串行，并行应放在X上。

// Extract Z loop: accumulate one map at a time into ret.
// The map loop is deliberately sequential: every map adds into the same ret cells, so
// running the maps in parallel would make the non-atomic "+=" updates race and lose
// contributions. Parallelism is applied over x instead, so each worker writes a
// disjoint ret[x, *, *] slice.
for (int z = 0; z < normcount; z++)
{
    // Hoist the per-map lookups out of the hot loops; the list indexer and the weight
    // are invariant for the whole pass and the compiler does not hoist them itself.
    double[, ,] temp = normalMaps[z];
    double weight = weights[z];

    Parallel.For(0, w, x =>
    {
        for (int y = 0; y < h; y++)
        {
            ret[x, y, 0] += temp[x, y, 0] * weight;
            ret[x, y, 1] += temp[x, y, 1] * weight;
            ret[x, y, 2] += temp[x, y, 2] * weight;
        }
    });
}

// Second pass: scale, clamp and normalise every accumulated cell of ret.
Parallel.For(0, w, x =>
{
    for (int y = 0; y < h; y++)
    {
        // Apply the gain, then clamp each component into [-1, 1].
        double cx = Math.Max(-1.0, Math.Min(1.0, ret[x, y, 0] * gain));
        double cy = Math.Max(-1.0, Math.Min(1.0, ret[x, y, 1] * gain));
        double cz = Math.Max(-1.0, Math.Min(1.0, ret[x, y, 2] * gain));

        // Divide the clamped triple by its Euclidean length.
        double length = Math.Sqrt(cx * cx + cy * cy + cz * cz);

        ret[x, y, 0] = cx / length;
        ret[x, y, 1] = cy / length;
        ret[x, y, 2] = cz / length;
    }
});

Answer 4

试试这个，

/// <summary>
/// Blends the given maps into one <c>double[w, h, 3]</c> map by calling
/// <c>OneStack</c> once for every (x, y) cell.
/// </summary>
/// <param name="normalMaps">Maps to blend; the output size is taken from the first one.</param>
/// <param name="weights">Weight for the map at the same index in <paramref name="normalMaps"/>.</param>
/// <param name="gain">Factor passed through to <c>OneStack</c>.</param>
/// <returns>The newly allocated result array.</returns>
private double[,,] Mix(double[][,,] normalMaps, double[] weights, double gain)
{
    var w = normalMaps[0].GetLength(0);
    var h = normalMaps[0].GetLength(1);

    var result = new double[w, h, 3];
    var mapCount = normalMaps.Length;

    // Parallelise over x (the first array index). OneStack writes only result[x, y, *],
    // so iterations for different x values never write the same cell.
    Parallel.For(0, w, x =>
    {
        for (int y = 0; y < h; y++)
        {
            OneStack(
                x,
                y,
                mapCount,
                normalMaps,
                weights,
                gain,
                result);
        }
    });

    return result;
}

/// <summary>
/// Computes one output cell: the weighted sum of the three components at (x, y) over the
/// first <paramref name="mapCount"/> maps, multiplied by <paramref name="gain"/>, clamped
/// per component to [-1, 1] and divided by the Euclidean length of the clamped triple.
/// The three values are written to <paramref name="result"/> at [x, y, 0..2].
/// </summary>
/// <param name="x">First index of the cell.</param>
/// <param name="y">Second index of the cell.</param>
/// <param name="mapCount">Number of maps (and weights) to combine; index 0 is always read.</param>
/// <param name="normalMaps">Source maps.</param>
/// <param name="weights">Weight for the map at the same index.</param>
/// <param name="gain">Factor applied to each summed component before clamping.</param>
/// <param name="result">Destination array; only the cell at (x, y) is written.</param>
private static void OneStack(
    int x,
    int y,
    int mapCount,
    double[][,,] normalMaps,
    double[] weights,
    double gain,
    double[,,] result)
{
    // Seed the accumulators from map 0 so the loop below starts at index 1.
    double factor = weights[0];
    double[,,] layer = normalMaps[0];
    double sumX = layer[x, y, 0] * factor;
    double sumY = layer[x, y, 1] * factor;
    double sumZ = layer[x, y, 2] * factor;

    int index = 1;
    while (index < mapCount)
    {
        factor = weights[index];
        layer = normalMaps[index];

        sumX += layer[x, y, 0] * factor;
        sumY += layer[x, y, 1] * factor;
        sumZ += layer[x, y, 2] * factor;

        index++;
    }

    // Apply the gain and clamp each component into [-1, 1].
    double clampedX = Math.Max(-1.0, Math.Min(1.0, sumX * gain));
    double clampedY = Math.Max(-1.0, Math.Min(1.0, sumY * gain));
    double clampedZ = Math.Max(-1.0, Math.Min(1.0, sumZ * gain));

    // Normalise by the Euclidean length; a zero length yields NaN components.
    double length = Math.Sqrt(clampedX * clampedX + clampedY * clampedY + clampedZ * clampedZ);

    result[x, y, 0] = clampedX / length;
    result[x, y, 1] = clampedY / length;
    result[x, y, 2] = clampedZ / length;
}

我预计会有改进，因为对大型多维数组的赋值和访问次数被减到了最少。虽然这是以额外的实例化为代价的，但我预计使用多维数组的成本会更高。多维数组在.NET中基本上是有问题的。

寻找一种更快的方法来在C＃中对数组求和

4 个答案: