Question

我用多种语言和实现方式编写了相同的代码（计算二十一点中在不爆牌的情况下发牌方式的数量）。我注意到一件奇怪的事：用 Python 调用 C 编写的 partitions 函数的实现，实际上比完全用 C 编写的程序还要略快一些。其他语言似乎也是如此（Ada 对比 Python 调用 Ada，Nim 对比 Python 调用 Nim）。这在我看来有悖直觉——有人知道这是怎么回事吗？

代码全部在我的 GitHub 仓库中：

https://github.com/octonion/puzzles/tree/master/blackjack

这是使用'gcc -O3 outcome.c'编译的C代码。

#include <stdio.h>

/*
 * Count the distinct ways a hand can be continued from `subtotal` without
 * exceeding 21.
 *
 * cards    - remaining count of each card value; cards[k] is the number of
 *            cards worth k + 1 still available. Modified during the
 *            recursion but restored before returning.
 * subtotal - value of the hand before drawing another card.
 *
 * Returns the number of (draw..., stand) sequences whose total stays <= 21.
 */
int partitions(int cards[10], int subtotal)
{
    int count = 0;

    for (int rank = 0; rank < 10; ++rank)
    {
        /* No card of this value left to draw. */
        if (cards[rank] <= 0)
            continue;

        const int total = subtotal + rank + 1;

        /* Drawing this card busts the hand. */
        if (total > 21)
            continue;

        /* Drawing this card and standing is one outcome. */
        ++count;

        /* At exactly 21 any further hit is an automatic bust. */
        if (total < 21)
        {
            --cards[rank];
            count += partitions(cards, total);
            ++cards[rank];
        }
    }

    return count;
}

/*
 * Enumerate every dealer up-card and every first player card, summing the
 * number of non-busting continuations reported by partitions().
 * Prints one line per dealer card index and a final grand total.
 */
int main(void)
{
    /* deck[k] holds how many cards worth k + 1 are available. */
    int deck[10] = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 16 };
    int grand_total = 0;

    for (int dealer = 0; dealer < 10; ++dealer)
    {
        /* Remove the dealer's visible card from the deck. */
        --deck[dealer];

        int per_dealer = 0;
        for (int first = 0; first < 10; ++first)
        {
            /* Take the player's first card, count continuations, put it back. */
            --deck[first];
            per_dealer += partitions(deck, first + 1);
            ++deck[first];
        }

        printf("Dealer showing %i partitions = %i\n", dealer, per_dealer);
        grand_total += per_dealer;

        /* Return the dealer's card before trying the next one. */
        ++deck[dealer];
    }

    printf("Total partitions = %i\n", grand_total);
    return 0;
}

这是使用'gcc -O3 -fPIC -shared -o libpartitions.so partitions.c'编译的C函数。

/*
 * Shared-library entry point: count the ways to keep drawing from `cards`
 * starting at `subtotal` while the hand total stays at or below 21.
 *
 * cards    - cards[k] is the number of cards worth k + 1 still available;
 *            temporarily decremented while recursing, restored on return.
 * subtotal - current hand value.
 *
 * Returns the number of stand positions reachable without busting.
 */
int partitions(int cards[10], int subtotal)
{
    int ways = 0;

    for (int value = 1; value <= 10; value++)
    {
        int *left = &cards[value - 1];

        if (*left > 0)
        {
            int total = subtotal + value;

            if (total <= 21)
            {
                /* Stand after drawing this card. */
                ways++;

                /* Hit again only below 21; at 21 another card always busts. */
                if (total != 21)
                {
                    (*left)--;
                    ways += partitions(cards, total);
                    (*left)++;
                }
            }
        }
    }

    return ways;
}

这是C函数的Python包装器：

#!/usr/bin/env python

from ctypes import *
import os

# Load the shared library built from partitions.c and declare the C signature
# `int partitions(int cards[10], int subtotal)` so ctypes converts arguments
# and the return value correctly.
test_lib = cdll.LoadLibrary(os.path.abspath("libpartitions.so"))
test_lib.partitions.argtypes = [POINTER(c_int), c_int]
test_lib.partitions.restype = c_int

# deck[k] is the number of cards worth k + 1: four each for 1..9, sixteen tens.
deck = ([4]*9)
deck.append(16)

# Grand total over every dealer up-card.
d = 0

# range() rather than the Python 2-only xrange(): xrange raises NameError on
# Python 3, while range() iterates 0..9 identically on both versions.
for i in range(10):
    # Dealer showing
    deck[i] -= 1
    p = 0
    for j in range(10):
        # Take the player's first card out of the deck for this count.
        deck[j] -= 1
        # Build a fresh C int array from the current deck; the C function
        # works on this copy, so the Python list is only changed here.
        nums_arr = (c_int*len(deck))(*deck)
        n = test_lib.partitions(nums_arr, c_int(j+1))
        deck[j] += 1
        p += n
    print('Dealer showing ', i,' partitions =',p)
    d += p
    # Put the dealer's card back before trying the next one.
    deck[i] += 1

print('Total partitions =',d)

Answer 1

我认为原因在于 GCC 在这两种情况下编译 partitions 函数的方式不同。您可以使用 objdump 比较 outcomes 可执行文件和 libpartitions.so 中的汇编代码，以查看差异。

objdump -d -M intel <file name>

在构建共享库时，GCC 不知道 partitions 将会被如何调用。而在 C 程序中，GCC 确切地知道 partitions 何时被调用（不过在本例中，这反而导致了更差的性能）。这种上下文上的差异使得 GCC 采用了不同的优化方式。

您可以尝试使用不同的编译器来比较结果。我已经检查过GCC 5.4和Clang 6.0。使用GCC 5.4，Python脚本运行得更快，而使用Clang，C程序运行得更快。

Python + C（略微）比纯C快

1 个答案: