Question

我已经实现了一个简单函数的普通和并行版本，它可以从32bppArgb位图计算直方图。正常版本在1920x1080图像上大约需要0.03秒，而并行版本需要0.07秒。

线程开销真的很重吗？除了Parallel之外还有其他一些构造吗？这可以加速这个过程吗？我需要加快速度，因为我正在使用30fps视频。

以下是简化代码：

/// <summary>
/// Per-channel (A, R, G, B) histogram of a bitmap stored with four bytes per pixel.
/// </summary>
/// <remarks>
/// Each pixel is read as four consecutive bytes in B, G, R, A order, which is the
/// layout this class requests from <c>LockBits</c> (Format32bppArgb).
/// </remarks>
public sealed class Histogram
{
    // Number of distinct values of one 8-bit channel.
    private const int BinCount = 256;

    // Bytes occupied by one pixel in the layout this class reads.
    private const int BytesPerPixel = 4;

    // Highest bin count found in each channel by the last ComputeHistogram call.
    public int MaxA = 0;
    public int MaxR = 0;
    public int MaxG = 0;
    public int MaxB = 0;

    // Highest bin count across all four channels.
    public int MaxT = 0;

    // One counter per channel value (0..255).
    public int [] A = null;
    public int [] R = null;
    public int [] G = null;
    public int [] B = null;

    /// <summary>Allocates the four 256-entry bin arrays and zeroes all state.</summary>
    public Histogram ()
    {
        this.A = new int [BinCount];
        this.R = new int [BinCount];
        this.G = new int [BinCount];
        this.B = new int [BinCount];

        this.Initialize();
    }

    /// <summary>Resets every bin and every maximum to zero.</summary>
    public void Initialize ()
    {
        this.MaxA = 0;
        this.MaxR = 0;
        this.MaxG = 0;
        this.MaxB = 0;
        this.MaxT = 0;

        System.Array.Clear(this.A, 0, this.A.Length);
        System.Array.Clear(this.R, 0, this.R.Length);
        System.Array.Clear(this.G, 0, this.G.Length);
        System.Array.Clear(this.B, 0, this.B.Length);
    }

    /// <summary>
    /// Locks the whole bitmap as Format32bppArgb, computes its histogram and unlocks it again.
    /// </summary>
    /// <param name="bitmap">Bitmap to analyse.</param>
    /// <param name="parallel">True to distribute the rows over several threads.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="bitmap"/> is null.</exception>
    public void ComputeHistogram (System.Drawing.Bitmap bitmap, bool parallel = false)
    {
        if (bitmap == null)
        {
            throw new System.ArgumentNullException("bitmap");
        }

        System.Drawing.Imaging.BitmapData data = bitmap.LockBits
        (
            new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
            System.Drawing.Imaging.ImageLockMode.ReadOnly,
            System.Drawing.Imaging.PixelFormat.Format32bppArgb
        );

        try
        {
            this.ComputeHistogram(data, parallel);
        }
        finally
        {
            // Runs on success and on failure, so the bitmap is never left locked.
            bitmap.UnlockBits(data);
        }
    }

    /// <summary>
    /// Computes the histogram of already-locked bitmap data and refreshes the Max* fields.
    /// </summary>
    /// <param name="data">Locked bitmap data; it is read as four bytes (B, G, R, A) per pixel.</param>
    /// <param name="parallel">True to distribute the rows over several threads.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="data"/> is null.</exception>
    public void ComputeHistogram (System.Drawing.Imaging.BitmapData data, bool parallel = false)
    {
        if (data == null)
        {
            throw new System.ArgumentNullException("data");
        }

        this.Initialize();

        System.IntPtr scan0 = data.Scan0;
        int height = data.Height;

        // Only the pixel bytes of a row are counted; any bytes between the end of
        // the pixels and the next row are skipped.
        int rowBytes = data.Width * BytesPerPixel;

        // The stride keeps its sign: a negative stride means each following row
        // lies at a lower address than Scan0. Held as long so that stride * y
        // cannot overflow 32-bit arithmetic.
        long stride = data.Stride;

        if (parallel)
        {
            object mergeGate = new object();

            unsafe
            {
                System.Threading.Tasks.Parallel.For<int [][]>
                (
                    0,
                    height,
                    new System.Threading.Tasks.ParallelOptions() { MaxDegreeOfParallelism = System.Environment.ProcessorCount },
                    // Every worker counts into its own private bins, so the hot
                    // loop never writes to memory shared with another thread.
                    () => new int [][] { new int [BinCount], new int [BinCount], new int [BinCount], new int [BinCount] },
                    (y, loopState, local) =>
                    {
                        AccumulateRow(((byte*) scan0) + (stride * y), rowBytes, local [0], local [1], local [2], local [3]);

                        return local;
                    },
                    // Each worker folds its private bins into the shared ones once, under a lock.
                    local =>
                    {
                        lock (mergeGate)
                        {
                            AddInto(this.B, local [0]);
                            AddInto(this.G, local [1]);
                            AddInto(this.R, local [2]);
                            AddInto(this.A, local [3]);
                        }
                    }
                );
            }
        }
        else
        {
            unsafe
            {
                byte* firstRow = (byte*) scan0;

                for (int y = 0; y < height; y++)
                {
                    AccumulateRow(firstRow + (stride * y), rowBytes, this.B, this.G, this.R, this.A);
                }
            }
        }

        this.MaxA = MaxOf(this.A);
        this.MaxR = MaxOf(this.R);
        this.MaxG = MaxOf(this.G);
        this.MaxB = MaxOf(this.B);

        this.MaxT = System.Math.Max(System.Math.Max(this.MaxA, this.MaxR), System.Math.Max(this.MaxG, this.MaxB));
    }

    // Counts the B, G, R, A bytes of one row of pixels into the given bins.
    private static unsafe void AccumulateRow (byte* row, int rowBytes, int [] b, int [] g, int [] r, int [] a)
    {
        for (int x = 0; x < rowBytes; x += BytesPerPixel)
        {
            b [row [x + 0]]++;
            g [row [x + 1]]++;
            r [row [x + 2]]++;
            a [row [x + 3]]++;
        }
    }

    // Adds every bin of source onto the bin of target with the same index.
    private static void AddInto (int [] target, int [] source)
    {
        for (int i = 0; i < source.Length; i++)
        {
            target [i] += source [i];
        }
    }

    // Returns the largest bin count, or 0 when no bin is above zero.
    private static int MaxOf (int [] bins)
    {
        int max = 0;

        for (int i = 0; i < bins.Length; i++)
        {
            if (max < bins [i])
            {
                max = bins [i];
            }
        }

        return max;
    }
}

Answer 1

嗯，首先，你的并行循环中有一个巨大的错误：

多个线程会同时访问、递增并更新共享数组——由于存在固有的竞争条件，仅仅对同一幅图像多次运行示例代码，就会得到截然不同的结果。

但那不是你问的。

至于为什么并行实现反而更慢，简单的答案是：每个并行任务体内所做的工作可能太少，不足以抵消创建新任务、对其进行调度等带来的“启动成本”。

可能更关键的是，我认为内存中的大量跳跃访问正在拖垮 L1/L2 缓存——每个任务线程都会尝试把它认为需要的数据加载进缓存，但由于各线程在内存各处进行索引，访问模式不再一致，因此每次访问位图缓冲区或内部数组时，都很可能发生缓存未命中。

还有一种同样高效的方法是在不使用不安全代码的情况下获取位图的只读数据......实际上，让我们先做到这一点：

通过调用LockBits，您可以获得指向非托管内存的指针。让我们复制一下：

// Lock the whole bitmap for reading as 32-bit ARGB.
System.Drawing.Imaging.BitmapData data = bitmap.LockBits
(
    new System.Drawing.Rectangle(0, 0, bitmap.Width, bitmap.Height),
    System.Drawing.Imaging.ImageLockMode.ReadOnly,
    System.Drawing.Imaging.PixelFormat.Format32bppArgb
);

// Assigned inside the try block, used after the bitmap has been unlocked.
int imageStride;
int imageHeight;
byte[] buffer;

try
{
    // For later usage
    imageStride = data.Stride;
    imageHeight = data.Height;

    // allocate space to hold the data
    // NOTE(review): assumes a positive stride — confirm for bitmaps stored bottom-up.
    buffer = new byte[imageStride * imageHeight];

    // Source will be the bitmap scan data
    IntPtr pointer = data.Scan0;

    // the CLR marshalling system knows how to move blocks of bytes around, FAST.
    Marshal.Copy(pointer, buffer, 0, buffer.Length);
}
finally
{
    // Unlock even when the allocation or the copy throws, so the bitmap is
    // never left locked; after the copy the locked data is no longer needed.
    bitmap.UnlockBits(data);
}

ComputeHistogram(buffer, imageStride, imageHeight, parallel);

现在来看竞争条件——你可以使用 Interlocked 调用来递增计数，以尚可接受的性能解决这个问题（注意！！！多线程编程很难，我的解决方案完全有可能并不完美！）

/// <summary>
/// Computes the per-channel histogram of a managed copy of the pixel data and
/// refreshes the Max* fields.
/// </summary>
/// <param name="data">Pixel bytes, read as four bytes (B, G, R, A) per pixel.</param>
/// <param name="stride">Number of bytes in one row of <paramref name="data"/>.</param>
/// <param name="height">Number of rows in <paramref name="data"/>.</param>
/// <param name="parallel">True to process the rows with Parallel.For.</param>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
public void ComputeHistogram (byte[] data, int stride, int height, bool parallel = false)
{
    if (data == null)
    {
        throw new ArgumentNullException("data");
    }

    this.Initialize();

    if (parallel)
    {
        System.Threading.Tasks.Parallel.For
        (
            0,
            height,
            new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
            y =>
            {
                int startIndex = (stride * y);
                int endIndex = stride * (y+1);
                for (int x = startIndex; x < endIndex; x += 4)
                {
                    // Interlocked increments are atomic, so concurrent updates
                    // of the same bin from different threads are not lost.
                    Interlocked.Increment(ref this.B[data[x]]);
                    Interlocked.Increment(ref this.G[data[x+1]]);
                    Interlocked.Increment(ref this.R[data[x+2]]);
                    Interlocked.Increment(ref this.A[data[x+3]]);
                }
            }
        );
    }
    else
    {
        // Only one thread touches the bins here, so plain increments are
        // enough. The rows are contiguous in the buffer, so a single flat
        // pass covers the same bytes as a row-by-row walk.
        int endIndex = stride * height;
        for (int x = 0; x < endIndex; x += 4)
        {
            this.B[data[x]]++;
            this.G[data[x+1]]++;
            this.R[data[x+2]]++;
            this.A[data[x+3]]++;
        }
    }

    // Largest bin of each channel, then the largest across all channels.
    this.MaxA = this.A.Max();
    this.MaxR = this.R.Max();
    this.MaxG = this.G.Max();
    this.MaxB = this.B.Max();
    this.MaxT = new[] { this.MaxA, this.MaxB, this.MaxG, this.MaxR }.Max();
}

那么，这对运行时行为有何影响？

影响不大，但至少并行分支现在能算出正确的结果了。：）

使用一个非常简陋的测试程序：

// Quick-and-dirty benchmark: runs the histogram ten times without and ten times
// with parallelism and prints the average wall-clock time per run.
void Main()
{
    foreach(var useParallel in new[]{false, true})
    {
        var totalRunTime = TimeSpan.Zero;
        var sw = new Stopwatch();
        var runCount = 10;
        for(int run=0; run < runCount; run++)
        {
            // Collect before timing so leftovers from the previous run are
            // less likely to trigger a collection inside the measured region.
            GC.Collect();
            GC.WaitForPendingFinalizers();
            GC.Collect();

            // The measured region includes loading the bitmap from disk.
            sw.Restart();
            using (var bmp = (Bitmap) Image.FromFile(@"c:\temp\banner.bmp"))
            {
                var hist = new Histogram();
                hist.ComputeHistogram(bmp, useParallel);
                sw.Stop();
            }
            // The bitmap is disposed after the stopwatch stops, so releasing
            // it is not part of the measurement.

            totalRunTime = totalRunTime.Add(sw.Elapsed);
        }
        Console.WriteLine("Parallel={0}, Avg={1} ms", useParallel, totalRunTime.TotalMilliseconds / runCount);
    }
}

我得到的结果如下：

Parallel=False, Avg=1.69777 ms
Parallel=True, Avg=5.33584 ms

正如您所看到的，我们仍然没有解决您的原始问题。：）

所以让我们试着让并行工作“更好”：

让我们看看给每个任务“分配更多工作”会有什么效果：

if (parallel)
{
    // Number of rows handed to each parallel iteration.
    var batchSize = 2;

    // Round up so that the trailing rows are still processed when height is
    // not a multiple of batchSize.
    var batchCount = (height + batchSize - 1) / batchSize;

    // One past the last byte of pixel data; the final batch is clamped to it.
    var bufferEnd = stride * height;

    System.Threading.Tasks.Parallel.For
    (
        0,
        batchCount,
        new ParallelOptions() { MaxDegreeOfParallelism = Environment.ProcessorCount },
        batch =>
        {
            int startIndex = (stride * batch * batchSize);
            int endIndex = Math.Min(startIndex + (stride * batchSize), bufferEnd);
            for (int x = startIndex; x < endIndex; x += 4)
            {
                // Interlocked increments are atomic, so concurrent updates
                // of the same bin from different threads are not lost.
                Interlocked.Increment(ref this.B[data[x]]);
                Interlocked.Increment(ref this.G[data[x+1]]);
                Interlocked.Increment(ref this.R[data[x+2]]);
                Interlocked.Increment(ref this.A[data[x+3]]);
            }
        }
    );
}

结果：

Parallel=False, Avg=1.70273 ms
Parallel=True, Avg=4.82591 ms

哦，这看起来很有希望......我想知道当我们改变batchSize时会发生什么？

让我们改变我们的试验台：

// Benchmark variant: for each mode, doubles the batch size from 1 up to 512
// and prints the average wall-clock time of ten runs per setting.
void Main()
{
    foreach(var useParallel in new[]{false, true})
    {
        for(int batchSize = 1; batchSize < 1024; batchSize <<= 1)
        {
            var totalRunTime = TimeSpan.Zero;
            var sw = new Stopwatch();
            var runCount = 10;
            for(int run=0; run < runCount; run++)
            {
                // Collect before timing so leftovers from the previous run are
                // less likely to trigger a collection inside the measured region.
                GC.Collect();
                GC.WaitForPendingFinalizers();
                GC.Collect();

                // The measured region includes loading the bitmap from disk.
                sw.Restart();
                using (var bmp = (Bitmap) Image.FromFile(@"c:\temp\banner.bmp"))
                {
                    var hist = new Histogram();
                    hist.ComputeHistogram(bmp, useParallel, batchSize);
                    sw.Stop();
                }
                // The bitmap is disposed after the stopwatch stops, so
                // releasing it is not part of the measurement.

                totalRunTime = totalRunTime.Add(sw.Elapsed);
            }
            Console.WriteLine("Parallel={0}, BatchSize={1} Avg={2} ms", useParallel, batchSize, totalRunTime.TotalMilliseconds / runCount);
        }
    }
}

结果:(仅显示parallel = true，因为非并行不会改变）

Parallel=True, BatchSize=1 Avg=5.57644 ms
Parallel=True, BatchSize=2 Avg=5.49982 ms
Parallel=True, BatchSize=4 Avg=5.20434 ms
Parallel=True, BatchSize=8 Avg=5.1721 ms
Parallel=True, BatchSize=16 Avg=5.00405 ms
Parallel=True, BatchSize=32 Avg=4.44973 ms
Parallel=True, BatchSize=64 Avg=2.28332 ms
Parallel=True, BatchSize=128 Avg=1.39957 ms
Parallel=True, BatchSize=256 Avg=1.29156 ms
Parallel=True, BatchSize=512 Avg=1.28656 ms

一旦批大小达到 64–128 这个范围，耗时似乎就趋近于某种渐近线；当然，具体数值会因位图大小等因素而有所不同。

希望这对你有帮助！在我等待生产构建完成的这一天里，这是一次有趣的消遣！：）

Answer 2

创建线程有相当大的开销。执行可能比单线程版本快得多，但完成得太快，无法弥补这一初始开销。

如果你每帧都这样做，它只会减慢你的速度。

但是，如果你手动创建一个线程池、手动分配工作，并在每一帧重用这些线程，你可能会发现到第二帧或第三帧时，你的代码就会超过单线程版本。

并行化直方图功能

2 个答案: