Question

我正在学习编写并发数据结构，并把研究 ConcurrentStack 的实现作为学习练习。作为起点，我用 ILSpy 将其反编译为 C#，创建了一份 ConcurrentStack 实现的副本。我只研究并使用了 Push 和 TryPop 这两个方法。

但我的实现明显比原始版本慢。

我的测试使用 4 个线程（单路 4 核 CPU），每个线程通过线程亲和性绑定到不同的核心。每个线程执行 1,000,000 次循环，每次循环执行三次 Push 和三次 TryPop。多次运行测试后，所有线程全部完成所需的平均时间为：

ConcurrentStack => 445ms
Push / TryPop 克隆 => 670ms

所以，尽管据我所知两者的代码完全相同，克隆版本却慢了大约 50%。我在一次运行中执行了 500 次测试，并对所有结果取平均值，因此我认为问题并不在于代码最初的 JIT 编译。

为什么这些方法的副本会慢得多？

C# 实现

（为了完整起见，我提供了可用于复现结果的 C# 控制台应用程序代码。感兴趣的人可以运行一下，看看是否得到与我相同的结果。）

/// <summary>
/// Benchmark driver. For each runner type it repeatedly creates one runner per
/// logical processor, starts them all, waits for them to finish, and prints the
/// average wall-clock time per repetition in milliseconds.
/// </summary>
class Program
{
    /// <summary>
    /// Runs the benchmark for every runner type and waits for Enter before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        int processors = Environment.ProcessorCount;
        Console.WriteLine("Processors: {0}", processors);

        List<Type> runnersT = new List<Type>() { typeof(ThreadRunnerConcurrent),
                                                 typeof(ThreadRunnerCASStack)};
        int cycles = 500;
        foreach (Type runnerT in runnersT)
        {
            long total = 0;
            for (int i = 0; i < cycles; i++)
            {
                // Create a thread runner per processor
                List<ThreadRunner> runners = new List<ThreadRunner>(processors);
                for (int j = 0; j < processors; j++)
                {
                    // Direct cast instead of 'as': a type that is not a ThreadRunner
                    // fails here with InvalidCastException rather than with a
                    // NullReferenceException on the next line.
                    ThreadRunner runner = (ThreadRunner)Activator.CreateInstance(runnerT);
                    runner.Processor = j;
                    runners.Add(runner);
                }

                // Start each runner going
                Stopwatch sw = Stopwatch.StartNew();
                foreach (ThreadRunner runner in runners)
                {
                    runner.Start();
                }

                // Wait for all the runners to exit
                foreach (ThreadRunner runner in runners)
                {
                    runner.Join();
                }
                sw.Stop();

                // Validation happens after the clock is stopped so it is not
                // counted as part of the measured work.
                foreach (ThreadRunner runner in runners)
                {
                    runner.Check();
                }

                total += sw.ElapsedMilliseconds;
            }

            // Integer division: the average is truncated to whole milliseconds.
            Console.WriteLine("{0} Average: {1}ms", runnerT.Name, (total / cycles));
        }

        Console.WriteLine("Finished");
        Console.ReadLine();
    }
}

/// <summary>
/// Base class for a benchmark worker. Each runner owns one thread; when started,
/// the thread sets its processor affinity from <see cref="Processor"/> and then
/// executes the derived class's workload.
/// </summary>
abstract class ThreadRunner
{
    // Number of workload cycles passed to Run(int) on every start.
    private const int CyclesPerRun = 1000000;

    private int _processor;
    private Thread _thread;

    public ThreadRunner()
    {
    }

    /// <summary>
    /// Zero-based index of the processor this runner's thread is bound to.
    /// It is used as the bit position in the affinity mask.
    /// </summary>
    public int Processor
    {
        get { return _processor; }
        set { _processor = value; }
    }

    /// <summary>Creates the worker thread and starts it.</summary>
    public void Start()
    {
        _thread = new Thread(new ParameterizedThreadStart(Run));
        _thread.Start();
    }

    /// <summary>Blocks until the worker thread has finished.</summary>
    public void Join()
    {
        _thread.Join();
    }

    /// <summary>Validates the runner's state after the run has completed.</summary>
    public abstract void Check();

    /// <summary>Executes the workload.</summary>
    /// <param name="cycles">Number of iterations to perform.</param>
    protected abstract void Run(int cycles);

    // Thread entry point; 'param' is unused (Start() passes no argument).
    private void Run(object param)
    {
        SetAffinity();
        Run(CyclesPerRun);
    }

    /// <summary>
    /// Restricts the calling thread to the single processor selected by
    /// <see cref="Processor"/>.
    /// </summary>
    private void SetAffinity()
    {
        // AppDomain.GetCurrentThreadId is obsolete (hence the pragma); it is used
        // here to obtain an id that is matched against ProcessThread.Id below.
        // NOTE(review): on runtimes where this returns a managed thread id rather
        // than the OS thread id, the lookup below will not match - confirm for the
        // target runtime.
        #pragma warning disable 618
        int osThreadId = AppDomain.GetCurrentThreadId();
        #pragma warning restore 618

        // Process is IDisposable; dispose it so its resources are not left to
        // the finalizer on every runner start.
        using (Process process = Process.GetCurrentProcess())
        {
            // Set the thread's processor affinity
            ProcessThread thread = process.Threads.Cast<ProcessThread>().Single(t => t.Id == osThreadId);
            thread.ProcessorAffinity = new IntPtr(1L << Processor);
        }
    }
}

/// <summary>
/// Runner whose workload targets a <see cref="ConcurrentStack{T}"/> of int.
/// The stack is a static field, so all instances of this runner share it.
/// </summary>
class ThreadRunnerConcurrent : ThreadRunner
{
    private static ConcurrentStack<int> _stack = new ConcurrentStack<int>();

    /// <summary>
    /// Performs <paramref name="cycles"/> iterations; each iteration does three
    /// pushes and three pops, retrying a pop until it succeeds.
    /// </summary>
    /// <param name="cycles">Number of iterations to perform.</param>
    protected override void Run(int cycles)
    {
        int popped;
        int cycle = 0;
        while (cycle < cycles)
        {
            _stack.Push(cycle);
            _stack.Push(cycle);
            while (!_stack.TryPop(out popped))
            {
            }

            _stack.Push(cycle);
            while (!_stack.TryPop(out popped))
            {
            }
            while (!_stack.TryPop(out popped))
            {
            }

            cycle++;
        }
    }

    /// <summary>Prints a warning when the shared stack still holds items.</summary>
    public override void Check()
    {
        if (_stack.Count <= 0)
        {
            return;
        }

        Console.WriteLine("ThreadRunnerConcurrent has entries!");
    }
}

/// <summary>
/// Runner whose workload targets a CASStack of int.
/// The stack is a static field, so all instances of this runner share it.
/// </summary>
class ThreadRunnerCASStack : ThreadRunner
{
    private static CASStack<int> _stack = new CASStack<int>();

    /// <summary>
    /// Performs <paramref name="cycles"/> iterations; each iteration does three
    /// pushes and three pops, retrying a pop until it succeeds.
    /// </summary>
    /// <param name="cycles">Number of iterations to perform.</param>
    protected override void Run(int cycles)
    {
        int popped;
        int cycle = 0;
        while (cycle < cycles)
        {
            _stack.Push(cycle);
            _stack.Push(cycle);
            while (!_stack.TryPop(out popped))
            {
            }

            _stack.Push(cycle);
            while (!_stack.TryPop(out popped))
            {
            }
            while (!_stack.TryPop(out popped))
            {
            }

            cycle++;
        }
    }

    /// <summary>Prints a warning when the shared stack still holds items.</summary>
    public override void Check()
    {
        if (_stack.Count <= 0)
        {
            return;
        }

        Console.WriteLine("ThreadRunnerCASStack has entries!");
    }
}

/// <summary>
/// A stack built on a singly linked list whose head is replaced with
/// <see cref="Interlocked.CompareExchange{T}(ref T, T, T)"/>. Only pushing,
/// popping and counting are supported.
/// </summary>
/// <typeparam name="T">Type of the stored items.</typeparam>
class CASStack<T>
{
    /// <summary>Linked-list node holding one item and a link to the node below it.</summary>
    private class Node
    {
        internal readonly T m_value;
        internal CASStack<T>.Node m_next;
        internal Node(T value)
        {
            this.m_value = value;
            this.m_next = null;
        }
    }

    // Top of the stack; null when the stack is empty.
    private volatile CASStack<T>.Node m_head;

    /// <summary>
    /// Number of items found by walking the list from the current head.
    /// The walk visits every node, so the cost grows with the stack size, and
    /// the value is only a snapshot if other threads are pushing or popping.
    /// </summary>
    public int Count
    {
        get
        {
            int count = 0;
            for (CASStack<T>.Node node = this.m_head; node != null; node = node.m_next)
            {
                count++;
            }
            return count;
        }
    }

    /// <summary>Pushes <paramref name="item"/> onto the top of the stack.</summary>
    /// <param name="item">The item to push.</param>
    public void Push(T item)
    {
        CASStack<T>.Node node = new CASStack<T>.Node(item);
        node.m_next = this.m_head;

        // Fast path: a single attempt to swing the head to the new node.
        if (Interlocked.CompareExchange<CASStack<T>.Node>(ref this.m_head, node, node.m_next) == node.m_next)
        {
            return;
        }

        // The head changed under us; retry with spinning.
        PushCore(node, node);
    }

    /// <summary>
    /// Retries installing the chain <paramref name="head"/>..<paramref name="tail"/>
    /// as the new top of the stack, spinning before every attempt.
    /// </summary>
    /// <param name="head">First node of the chain; becomes the new head.</param>
    /// <param name="tail">Last node of the chain; linked to the previous head.</param>
    private void PushCore(Node head, Node tail)
    {
        SpinWait spinWait = default(SpinWait);

        do
        {
            spinWait.SpinOnce();
            tail.m_next = this.m_head;
        }
        while (Interlocked.CompareExchange<CASStack<T>.Node>(ref this.m_head, head, tail.m_next) != tail.m_next);
    }

    /// <summary>Attempts to remove and return the item at the top of the stack.</summary>
    /// <param name="result">
    /// The removed item on success; <c>default(T)</c> when the stack was empty.
    /// </param>
    /// <returns><c>true</c> if an item was removed; <c>false</c> if the stack was empty.</returns>
    public bool TryPop(out T result)
    {
        CASStack<T>.Node head = this.m_head;

        if (head == null)
        {
            result = default(T);
            return false;
        }

        // Fast path: a single attempt to swing the head to the next node.
        if (Interlocked.CompareExchange<CASStack<T>.Node>(ref this.m_head, head.m_next, head) == head)
        {
            result = head.m_value;
            return true;
        }

        // The head changed under us; retry with back-off.
        return TryPopCore(out result);
    }

    /// <summary>Pops a single item through the back-off retry loop.</summary>
    /// <param name="result">
    /// The removed item on success; <c>default(T)</c> when the stack was empty.
    /// </param>
    /// <returns><c>true</c> if an item was removed; otherwise <c>false</c>.</returns>
    private bool TryPopCore(out T result)
    {
        CASStack<T>.Node node;
        if (TryPopCore(1, out node) == 1)
        {
            result = node.m_value;
            return true;
        }
        result = default(T);
        return false;
    }

    /// <summary>
    /// Detaches up to <paramref name="count"/> nodes from the top of the stack,
    /// retrying until the head swap succeeds or the stack is observed empty.
    /// </summary>
    /// <param name="count">Maximum number of nodes to detach.</param>
    /// <param name="poppedHead">
    /// First detached node (the former head), or <c>null</c> if the stack was empty.
    /// </param>
    /// <returns>Number of nodes detached; 0 if the stack was empty.</returns>
    private int TryPopCore(int count, out CASStack<T>.Node poppedHead)
    {
        SpinWait spinWait = default(SpinWait);
        int backoff = 1;
        Random random = new Random(Environment.TickCount & 2147483647);

        while (true)
        {
            CASStack<T>.Node head = this.m_head;
            if (head == null)
            {
                poppedHead = null;
                return 0;
            }

            // Walk down to the last node to detach (at most 'count' nodes).
            CASStack<T>.Node last = head;
            int detached = 1;
            while (detached < count && last.m_next != null)
            {
                last = last.m_next;
                detached++;
            }

            if (Interlocked.CompareExchange<CASStack<T>.Node>(ref this.m_head, last.m_next, head) == head)
            {
                poppedHead = head;
                return detached;
            }

            // Lost the race: spin 'backoff' times before trying again.
            for (int i = 0; i < backoff; i++)
            {
                spinWait.SpinOnce();
            }

            // Double the spin count until the next spin would yield; from then
            // on pick a random count in [1, 8).
            backoff = (spinWait.NextSpinWillYield ? random.Next(1, 8) : (backoff * 2));
        }
    }
}
#endregion

Answer 1

ConcurrentStack<T> 有一个 CASStack<T> 不具备的优势，即使两者的代码完全相同。

ConcurrentStack<T> 拥有一个经过 NGen 处理的本机映像（native image），它是在您的计算机上安装 .NET Framework 时编译生成的。而您的 CASStack<T> 是通过 JIT 编译的；由于 JIT 必须编译得很快，它不会像 NGen 的 AOT 编译器那样执行那么多优化。

我在自己的计算机上测试了您的代码。在没有对您的程序集执行 NGen 的情况下，我得到了这些结果：

Processors: 4
ThreadRunnerConcurrent Average: 764ms
ThreadRunnerCASStack Average: 948ms
Finished

执行 NGen 之后：

Processors: 4
ThreadRunnerConcurrent Average: 778ms
ThreadRunnerCASStack Average: 742ms
Finished

使用相同的代码实现时，无法模仿ConcurrentStack的性能

1 个答案: