C - 实现快速将许多元素推送到数组的末尾

时间:2015-04-19 04:07:32

标签: c arrays optimization c99

我有一个简单的结构来保存数组:

/* Growable array of a_type: a heap buffer together with its capacity and
 * its current fill level. */
struct array_of_a_type {
        size_t allocated_size;  /* capacity of `array`, counted in elements */
        size_t elements; /* 1-index based */
        a_type *array;          /* buffer managed with realloc() */
};

我想写一个简单的函数,如下所示:

/* Builds a 15-element chunk from a, b, c and d in a local array and appends
 * it to my_array through push_to_array().  Returns push_to_array()'s result:
 * true when the append failed, false on success. */
bool simple_function(struct array_of_a_type *my_array, int a, int b, int c, int d)
{
    /* Temporary chunk; its element count is derived below with sizeof. */
    a_type new_chunk[] = {
        a,   b,   a+b, d,   c,
        c,   c,   c+d, b+d, a,
        a+c, b+c, c+d, c+d, c,
    };
    size_t size = sizeof(new_chunk) / sizeof(a_type);
    return push_to_array(my_array, new_chunk, size);
}

my_array是一个静态的全局变量。下面是push_to_array的实现。

/* Appends `size` elements from new_chunk to the end of `a`, growing the
 * buffer to exactly the required size when it is too small.
 * Returns true if realloc() failed (the array is left untouched),
 * false on success. */
static bool push_to_array(struct array_of_a_type *a, a_type *new_chunk, size_t size)
{
    const size_t used = a->elements;
    const size_t needed = used + size;

    if (a->allocated_size < needed) {
        /* The allocated_size is most of the time big enough. */
        a_type *grown = realloc(a->array, needed * sizeof(a_type));
        if (!grown)
            return true;
        a->array = grown;
        a->allocated_size = needed;
    }

    /* Copy the chunk behind the elements that were already in use. */
    memcpy(&a->array[used], new_chunk, size * sizeof(a_type));
    a->elements = needed;
    return false;
}

我的问题:
如何重写'simple_function'以使更多编译器生成直接写入目标的代码?我希望代码保持简短和灵活。

我的代码有效。不幸的是,gcc(和一个旧的clang)在堆栈上创建临时数据,然后将其复制到目标。下面是生成的x86_64汇编程序的片段。

movq    8(%rsp), %rdx
movq    %rdx, 8(%rax)
movq    16(%rsp), %rdx
movq    %rdx, 16(%rax)
movq    24(%rsp), %rdx
movq    %rdx, 24(%rax)
movq    32(%rsp), %rdx
movq    %rdx, 32(%rax)

对于AMD,汇编程序有这个:

rep movsq

较新版本的clang生成的代码很好(没有这个问题)。我使用-O3编译。

我尝试过一次添加一个元素的代码。不幸的是,有很多条件跳转来调用realloc。

7 个答案:

答案 0 :(得分:9)

为了提高效率,您需要分离用于增长数组的逻辑,并将值分配给(未使用的)插槽,以避免额外的副本(从堆栈到数组)。

为了美化代码,您可以创建一组辅助宏。我假设您所说的“push”是指“追加到数组末尾”。如果您真正想要的是“前插(prepend)”,那么还需要额外的memmove()。

我们假设你有

#include <stdlib.h>
#include <stdio.h>

/* Element type stored in the array. */
typedef int  array_data_type;

/* Growable array: `item` points to `size` allocated elements, of which the
 * first `used` are in use. */
typedef struct {
    size_t           size;   /* number of elements allocated */
    size_t           used;   /* number of elements in use */
    array_data_type *item;   /* element buffer, NULL when nothing is allocated */
} array_type;

/* Static initializer for an empty array (no buffer, zero size, zero used). */
#define ARRAY_INITIALIZER { 0, 0, NULL }

/* Releases the element buffer and resets the array to the empty state. */
void array_free(array_type *const array)
{
    array_data_type *const buffer = array->item;

    array->item = NULL;
    array->used = 0;
    array->size = 0;
    free(buffer);
}

/* Puts the array into the empty state: no buffer, size 0, used 0. */
void array_init(array_type *const array)
{
    *array = (array_type){ .size = 0, .used = 0, .item = NULL };
}

/* Initializes the array with room for `size` elements and none in use.
 * A size of zero yields the empty state without allocating.
 * On allocation failure a message is written to stderr and the process
 * exits with EXIT_FAILURE. */
void array_init_size(array_type *const array, const size_t size)
{
    if (size == 0) {
        array->item = NULL;
        array->used = 0;
        array->size = 0;
        return;
    }

    array_data_type *const buffer = malloc(size * sizeof *buffer);

    array->item = buffer;
    if (buffer == NULL) {
        fprintf(stderr, "array_init_size(%p, %zu): Out of memory.\n", (void *)array, size);
        exit(EXIT_FAILURE);
    }

    array->used = 0;
    array->size = size;
}

/* Reallocates the buffer so that it holds at least `size` elements.
 * The requested size is first rounded up by the growth policy:
 *   - below 4:           4
 *   - below 16,777,216:  the next power of two above the request
 *   - otherwise:         a larger multiple of 8,388,608
 * On allocation failure a message is written to stderr and the process
 * exits with EXIT_FAILURE. */
void array_grow_to(array_type *const array, size_t size)
{
    size_t capacity = size;

    if (capacity < 4) {
        capacity = 4;
    } else if (capacity < 16777216) {
        /* Smear the highest set bit into every lower bit, then add one. */
        for (unsigned shift = 1; shift <= 16; shift *= 2)
            capacity |= capacity >> shift;
        capacity += 1;
    } else {
        capacity = (capacity | 8388607) + 8388609;
    }

    array_data_type *const grown = realloc(array->item, capacity * sizeof *grown);
    if (grown == NULL) {
        fprintf(stderr, "array_grow_to(%p, %zu): Out of memory.\n", (void *)array, capacity);
        exit(EXIT_FAILURE);
    }

    array->size = capacity;
    array->item = grown;
}

/* Reserves `count` new elements at the end of the array, growing the buffer
 * through array_grow_to() when needed, and marks them as used.
 * Returns a pointer to the first of the newly reserved elements. */
static inline array_data_type *array_grow_by(array_type *const array, size_t const count)
{
    const size_t first_new = array->used;
    const size_t needed = first_new + count;

    if (needed > array->size)
        array_grow_to(array, needed);

    array->used = needed;
    return array->item + first_new;
}

我喜欢使用used表示数组中的元素数,而size表示数组为内存分配的元素数。如果您习惯使用其他名称,请进行搜索和替换。

array_grow_to()会调整新的大小:至少为4;如果小于16,777,216,则调整为下一个2的幂;否则调整为8,388,608的某个更大的倍数。这限制了非常大的列表中已分配但未使用的内存量。

array_grow_by()确保数组有count个新元素的空间,并返回指向第一个未使用的新元素的指针。

如果您定义以下C99预处理器宏,

/* Pastes part1 onto the (already expanded) remaining argument to form one token. */
#define MACRO_CONCATENATE(part1, ...)   part1 ## __VA_ARGS__

/* ARRAY_SET_N(array, count, v1, ..., vcount) dispatches to ARRAY_SET_<count>,
 * which expands to `count` assignments array[0] = v1; ... array[count-1] = vcount.
 * Each ARRAY_SET_k stores its first value at a[n-k] and hands the remaining
 * values to ARRAY_SET_(k-1).  Helpers exist for counts 0 through 64 only.
 * NOTE(review): the macro arguments are not parenthesized in the expansion,
 * so they are expected to be simple expressions. */
#define ARRAY_SET_N(array, count, ...)  MACRO_CONCATENATE(ARRAY_SET_, count)(array, count, __VA_ARGS__)
#define ARRAY_SET_0(...)
#define ARRAY_SET_1(a, n, v)        a[n-1] = v
#define ARRAY_SET_2(a, n, v, ...)   a[n-2] = v; ARRAY_SET_1(a, n, __VA_ARGS__)
#define ARRAY_SET_3(a, n, v, ...)   a[n-3] = v; ARRAY_SET_2(a, n, __VA_ARGS__)
#define ARRAY_SET_4(a, n, v, ...)   a[n-4] = v; ARRAY_SET_3(a, n, __VA_ARGS__)
#define ARRAY_SET_5(a, n, v, ...)   a[n-5] = v; ARRAY_SET_4(a, n, __VA_ARGS__)
#define ARRAY_SET_6(a, n, v, ...)   a[n-6] = v; ARRAY_SET_5(a, n, __VA_ARGS__)
#define ARRAY_SET_7(a, n, v, ...)   a[n-7] = v; ARRAY_SET_6(a, n, __VA_ARGS__)
#define ARRAY_SET_8(a, n, v, ...)   a[n-8] = v; ARRAY_SET_7(a, n, __VA_ARGS__)
#define ARRAY_SET_9(a, n, v, ...)   a[n-9] = v; ARRAY_SET_8(a, n, __VA_ARGS__)
#define ARRAY_SET_10(a, n, v, ...)  a[n-10] = v; ARRAY_SET_9(a, n, __VA_ARGS__)
#define ARRAY_SET_11(a, n, v, ...)  a[n-11] = v; ARRAY_SET_10(a, n, __VA_ARGS__)
#define ARRAY_SET_12(a, n, v, ...)  a[n-12] = v; ARRAY_SET_11(a, n, __VA_ARGS__)
#define ARRAY_SET_13(a, n, v, ...)  a[n-13] = v; ARRAY_SET_12(a, n, __VA_ARGS__)
#define ARRAY_SET_14(a, n, v, ...)  a[n-14] = v; ARRAY_SET_13(a, n, __VA_ARGS__)
#define ARRAY_SET_15(a, n, v, ...)  a[n-15] = v; ARRAY_SET_14(a, n, __VA_ARGS__)
#define ARRAY_SET_16(a, n, v, ...)  a[n-16] = v; ARRAY_SET_15(a, n, __VA_ARGS__)
#define ARRAY_SET_17(a, n, v, ...)  a[n-17] = v; ARRAY_SET_16(a, n, __VA_ARGS__)
#define ARRAY_SET_18(a, n, v, ...)  a[n-18] = v; ARRAY_SET_17(a, n, __VA_ARGS__)
#define ARRAY_SET_19(a, n, v, ...)  a[n-19] = v; ARRAY_SET_18(a, n, __VA_ARGS__)
#define ARRAY_SET_20(a, n, v, ...)  a[n-20] = v; ARRAY_SET_19(a, n, __VA_ARGS__)
#define ARRAY_SET_21(a, n, v, ...)  a[n-21] = v; ARRAY_SET_20(a, n, __VA_ARGS__)
#define ARRAY_SET_22(a, n, v, ...)  a[n-22] = v; ARRAY_SET_21(a, n, __VA_ARGS__)
#define ARRAY_SET_23(a, n, v, ...)  a[n-23] = v; ARRAY_SET_22(a, n, __VA_ARGS__)
#define ARRAY_SET_24(a, n, v, ...)  a[n-24] = v; ARRAY_SET_23(a, n, __VA_ARGS__)
#define ARRAY_SET_25(a, n, v, ...)  a[n-25] = v; ARRAY_SET_24(a, n, __VA_ARGS__)
#define ARRAY_SET_26(a, n, v, ...)  a[n-26] = v; ARRAY_SET_25(a, n, __VA_ARGS__)
#define ARRAY_SET_27(a, n, v, ...)  a[n-27] = v; ARRAY_SET_26(a, n, __VA_ARGS__)
#define ARRAY_SET_28(a, n, v, ...)  a[n-28] = v; ARRAY_SET_27(a, n, __VA_ARGS__)
#define ARRAY_SET_29(a, n, v, ...)  a[n-29] = v; ARRAY_SET_28(a, n, __VA_ARGS__)
#define ARRAY_SET_30(a, n, v, ...)  a[n-30] = v; ARRAY_SET_29(a, n, __VA_ARGS__)
#define ARRAY_SET_31(a, n, v, ...)  a[n-31] = v; ARRAY_SET_30(a, n, __VA_ARGS__)
#define ARRAY_SET_32(a, n, v, ...)  a[n-32] = v; ARRAY_SET_31(a, n, __VA_ARGS__)
#define ARRAY_SET_33(a, n, v, ...)  a[n-33] = v; ARRAY_SET_32(a, n, __VA_ARGS__)
#define ARRAY_SET_34(a, n, v, ...)  a[n-34] = v; ARRAY_SET_33(a, n, __VA_ARGS__)
#define ARRAY_SET_35(a, n, v, ...)  a[n-35] = v; ARRAY_SET_34(a, n, __VA_ARGS__)
#define ARRAY_SET_36(a, n, v, ...)  a[n-36] = v; ARRAY_SET_35(a, n, __VA_ARGS__)
#define ARRAY_SET_37(a, n, v, ...)  a[n-37] = v; ARRAY_SET_36(a, n, __VA_ARGS__)
#define ARRAY_SET_38(a, n, v, ...)  a[n-38] = v; ARRAY_SET_37(a, n, __VA_ARGS__)
#define ARRAY_SET_39(a, n, v, ...)  a[n-39] = v; ARRAY_SET_38(a, n, __VA_ARGS__)
#define ARRAY_SET_40(a, n, v, ...)  a[n-40] = v; ARRAY_SET_39(a, n, __VA_ARGS__)
#define ARRAY_SET_41(a, n, v, ...)  a[n-41] = v; ARRAY_SET_40(a, n, __VA_ARGS__)
#define ARRAY_SET_42(a, n, v, ...)  a[n-42] = v; ARRAY_SET_41(a, n, __VA_ARGS__)
#define ARRAY_SET_43(a, n, v, ...)  a[n-43] = v; ARRAY_SET_42(a, n, __VA_ARGS__)
#define ARRAY_SET_44(a, n, v, ...)  a[n-44] = v; ARRAY_SET_43(a, n, __VA_ARGS__)
#define ARRAY_SET_45(a, n, v, ...)  a[n-45] = v; ARRAY_SET_44(a, n, __VA_ARGS__)
#define ARRAY_SET_46(a, n, v, ...)  a[n-46] = v; ARRAY_SET_45(a, n, __VA_ARGS__)
#define ARRAY_SET_47(a, n, v, ...)  a[n-47] = v; ARRAY_SET_46(a, n, __VA_ARGS__)
#define ARRAY_SET_48(a, n, v, ...)  a[n-48] = v; ARRAY_SET_47(a, n, __VA_ARGS__)
#define ARRAY_SET_49(a, n, v, ...)  a[n-49] = v; ARRAY_SET_48(a, n, __VA_ARGS__)
#define ARRAY_SET_50(a, n, v, ...)  a[n-50] = v; ARRAY_SET_49(a, n, __VA_ARGS__)
#define ARRAY_SET_51(a, n, v, ...)  a[n-51] = v; ARRAY_SET_50(a, n, __VA_ARGS__)
#define ARRAY_SET_52(a, n, v, ...)  a[n-52] = v; ARRAY_SET_51(a, n, __VA_ARGS__)
#define ARRAY_SET_53(a, n, v, ...)  a[n-53] = v; ARRAY_SET_52(a, n, __VA_ARGS__)
#define ARRAY_SET_54(a, n, v, ...)  a[n-54] = v; ARRAY_SET_53(a, n, __VA_ARGS__)
#define ARRAY_SET_55(a, n, v, ...)  a[n-55] = v; ARRAY_SET_54(a, n, __VA_ARGS__)
#define ARRAY_SET_56(a, n, v, ...)  a[n-56] = v; ARRAY_SET_55(a, n, __VA_ARGS__)
#define ARRAY_SET_57(a, n, v, ...)  a[n-57] = v; ARRAY_SET_56(a, n, __VA_ARGS__)
#define ARRAY_SET_58(a, n, v, ...)  a[n-58] = v; ARRAY_SET_57(a, n, __VA_ARGS__)
#define ARRAY_SET_59(a, n, v, ...)  a[n-59] = v; ARRAY_SET_58(a, n, __VA_ARGS__)
#define ARRAY_SET_60(a, n, v, ...)  a[n-60] = v; ARRAY_SET_59(a, n, __VA_ARGS__)
#define ARRAY_SET_61(a, n, v, ...)  a[n-61] = v; ARRAY_SET_60(a, n, __VA_ARGS__)
#define ARRAY_SET_62(a, n, v, ...)  a[n-62] = v; ARRAY_SET_61(a, n, __VA_ARGS__)
#define ARRAY_SET_63(a, n, v, ...)  a[n-63] = v; ARRAY_SET_62(a, n, __VA_ARGS__)
#define ARRAY_SET_64(a, n, v, ...)  a[n-64] = v; ARRAY_SET_63(a, n, __VA_ARGS__)

/* ARRAY_APPEND_N(array, count, v1, ..., vcount): reserves `count` elements at
 * the end of `array` with array_grow_by() and stores the values directly
 * into them.  `count` must match the number of values supplied. */
#define ARRAY_APPEND_N(array, count, ...)                           \
    do {                                                            \
        array_data_type *const _base = array_grow_by(array, count); \
        ARRAY_SET_N(_base, count, __VA_ARGS__);                     \
    } while(0)

然后您可以将简单函数编写为

/* Appends 15 values derived from a, b, c and d to `array`.  The explicit
 * count (15) must equal the number of values listed in the macro call. */
void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    ARRAY_APPEND_N(array, 15, a,   b,   a+b, d,   c,
                              c,   c,   c+d, b+d, a,
                              a+c, b+c, c+d, c+d, c);
}

并将其预处理(缩进除外)

/* What the ARRAY_APPEND_N-based simple_function looks like after
 * preprocessing (indentation aside): one reservation of 15 elements followed
 * by 15 direct stores at constant offsets from _base. */
void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    do {
        array_data_type *const _base = array_grow_by(array, 15);
        _base[15 - 15] = a;
        _base[15 - 14] = b;
        _base[15 - 13] = a+b;
        _base[15 - 12] = d;
        _base[15 - 11] = c;
        _base[15 - 10] = c;
        _base[15 -  9] = c;
        _base[15 -  8] = c+d;
        _base[15 -  7] = b+d;
        _base[15 -  6] = a;
        _base[15 -  5] = a+c;
        _base[15 -  4] = b+c;
        _base[15 -  3] = c+d;
        _base[15 -  2] = c+d;
        _base[15 -  1] = c;
    } while(0);
}

通常编译为Intel / AMD64架构(以及支持相对寻址模式的其他架构)上的出色机器代码。在其他一些体系结构中,最好不要使_base成为常量,而是自动增量(*(_base++) = v;)。

如果实现PP_NARG()宏来计算宏参数的数量,则可以添加宏

/* Like ARRAY_APPEND_N, but the element count is computed by PP_NARG().
 * PP_NARG() is not defined in this snippet and must be supplied separately. */
#define ARRAY_APPEND(array, ...) ARRAY_APPEND_N(array, PP_NARG(__VA_ARGS__), __VA_ARGS__)

在这种情况下,您的功能简化为

/* Appends the 15 listed values to `array`; the element count is derived
 * from the argument list by ARRAY_APPEND. */
void simple_function(array_type *array,
                     const array_data_type a, const array_data_type b,
                     const array_data_type c, const array_data_type d)
{
    ARRAY_APPEND(array, a,   b,   a+b, d,   c,
                        c,   c,   c+d, b+d, a,
                        a+c, b+c, c+d, c+d, c);
}

在某些编译器中,预处理器宏的参数数量上限为64,这使得单个宏一次最多只能追加62个元素。根据您使用的编译器,您可以扩展这些宏以支持更多参数,但其他编译器可能无法处理(会报错)。

答案 1 :(得分:6)

必须完成一些代码重构。

首先你需要一个类似于push_to_array函数的辅助函数,但是这个函数只为元素分配新的内存:

/* Makes room for `size` additional elements and counts them as used.
 * The buffer is reallocated to exactly the required size when too small.
 * Returns true if realloc() failed (the array is left untouched),
 * false on success. */
static inline bool increase_size(struct array_of_a_type *a, size_t size)
{
    const size_t needed = a->elements + size;

    if (a->allocated_size < needed) {
        a_type *grown = realloc(a->array, needed * sizeof *grown);
        if (!grown)
            return true;
        a->allocated_size = needed;
        a->array = grown;
    }

    a->elements = needed;
    return false;
}

巧合的是,必须更改函数push_to_array以避免代码重复:

/* Appends `size` elements from new_chunk to `a`.  Space is reserved by
 * increase_size(); the chunk is then copied into the freshly reserved tail.
 * Returns true on allocation failure, false on success. */
static bool push_to_array(struct array_of_a_type *a, a_type *new_chunk, size_t size)
{
    if (increase_size(a, size))
        return true;

    /* increase_size() already advanced `elements`, so step back by `size`. */
    a_type *const tail = &a->array[a->elements - size];
    memcpy(tail, new_chunk, size * sizeof(a_type));
    return false;
}

现在,simple_function非常容易编写,而不使用临时数组:

/* Appends the 15 values derived from a, b, c and d directly to my_array,
 * without a temporary array.  Space is reserved with increase_size() first.
 * Returns true on allocation failure, false on success. */
bool simple_function(struct array_of_a_type *my_array, int a, int b, int c, int d)
{
    bool failed = increase_size( my_array , 15 );
    if( failed )
    {
        return failed;
    }

    /* increase_size() already counted the new elements; i is the index of
       the first of the 15 freshly reserved slots. */
    size_t i = my_array->elements - 15;
    my_array->array[i]    = a;
    my_array->array[i+1]  = b;
    my_array->array[i+2]  = a+b;
    my_array->array[i+3]  = d;
    my_array->array[i+4]  = c;
    my_array->array[i+5]  = c;
    my_array->array[i+6]  = c;
    my_array->array[i+7]  = c+d;
    my_array->array[i+8]  = b+d;
    my_array->array[i+9]  = a;
    my_array->array[i+10] = a+c;
    my_array->array[i+11] = b+c;
    my_array->array[i+12] = c+d;
    my_array->array[i+13] = c+d;
    my_array->array[i+14] = c;

    return false;
}

答案 2 :(得分:4)

您是否因为simple_function的new_chunk数组在堆栈中而感到生气?那是因为你把它声明成了a_type[]数组,这样它就会在堆栈上创建。你需要像这样创建数组:

a_type *ap = malloc(<size> * sizeof(a_type)); atype[0] = a; ...

然后你最后可以return ap

此外,您可能希望一次将一个成员推送到数组,这样您就可以保留静态数组,然后执行此操作:

int i; for (i = 0; i < <size>; i++) push_to_array(&my_array, new[i]);

让你的push_to_array函数稍微改变一下。

可以在此处找到push for stack的实现,请注意grow函数处理重新分配:https://github.com/minshallj/my_clib/blob/master/stack.c#L24-L31您应该能够将此调整为您的数组“class”。

此外,my_array是否是一个存在于您程序中其他地方的全局变量?我没有看到它在任何地方声明。

答案 3 :(得分:3)

malloc()固定大小的数组,而不是使用realloc()。每次realloc(),如果realloc()无法增加现有内存块,则可能会发生副本。

一种可能的解决方案是malloc()一个固定大小的数组,然后在该数组满时将其大小加倍。然后将数据复制到新加倍的数组。这将减少潜在副本的数量。

答案 4 :(得分:3)

你需要避免实际制作一个tmp数组。只有将推送例程内联到调用者中才能实现。没有办法通过除内存之外的函数调用传递可变长度列表。只有当调用者具有安全地将它们放在那里所需的所有逻辑时,才有可能将内存作为你想要它们的最终数组:即内联。

实际上,对于amd64上的clang-3.5,临时数组完全被优化,而simple_function直接写入数组末尾的最终位置。所以对memcpy的调用永远不会发生。对于gcc 4.9.2,情况并非如此。

我认为这不会在这里工作得很好,但你可以有一个两阶段的功能,检查可以内联的常见情况(如不需要realloc),否则调用完整的功能

你可以看看内联一个可变参数函数(比如bool push_many(struct array_of_a_type *a, size_t size, ...))会得到什么样的汇编。算了,我试过了:gcc和clang都不会内联可变参数函数。gcc确实生成了该函数的一个定制版本push_many.constprop.0:,但它看起来仍然比从调用者放置参数的堆栈上整块复制(block-copy)这些参数更慢。

/* Appends `size` values, passed as variadic arguments, to the end of `a`.
 *
 * Every variadic argument is read with va_arg(ap, a_type), so a_type must be
 * a type that survives the default argument promotions (the surrounding
 * text uses `typedef int a_type;`).
 *
 * Returns true if realloc() failed (the array is left untouched),
 * false on success. */
static bool push_many(struct array_of_a_type *a, size_t size, ...)
{
    const size_t old_size = a->elements;
    const size_t new_size = old_size + size;

    if (new_size > a->allocated_size) {
        a_type *tmp = realloc(a->array, new_size * sizeof(a_type));
        if (!tmp)
            return true;    /* nothing to clean up: va_start not yet called */
        a->array = tmp;
        a->allocated_size = new_size;
    }

    /* Start the argument walk exactly once, and only after the early return
       above, so every va_start is paired with a va_end. */
    va_list ap;
    va_start(ap, size);
    /* Index from the array base: slots old_size .. new_size-1 are the new ones. */
    for (size_t i = old_size; i < new_size; i++)
        a->array[i] = va_arg(ap, a_type);
    va_end(ap);

    a->elements = new_size;
    return false;
}

复制循环编译为14条指令,包括3个cmovcc条件移动。每次迭代都会复制一个int。 (typedef int a_type;)所以gcc 4.9.2 for amd64在内联可变参数函数方面毫无用处。 clang-3.5也会生成非常讨厌的代码。

或者另一种方法是使用宏来推送内联到调用函数。但GCC或C99可变参数宏不起作用;你不能在宏中迭代args,只将它们传递给像printf这样的可变函数。因此,您需要在每次宏调用中都使用check-for-realloc,而GCC必须对其进行优化。我非常确定gcc会很难优化所有的空格检查和增量计数器操作,因为写入的一次一个可能导致不同的数量调用realloc而不是将所有内容合并为一个。 (对于编译器来说,这将是一个非常难的优化,我认为。)

您可以使用PUSH4、PUSH8和PUSH16这样的宏,每个宏接受固定数量(较多)的参数。

或者你可以让你的代码变得非常脆弱,并且有一个ALLOC_SPACE_FOR_N_MORE宏/函数,然后是一个简单的NOCHECK_PUSH宏,假设有足够的空间,只是增加了计数器。 (并且希望gcc最后会在一系列推送中进行一次添加。)

答案 5 :(得分:1)

你的推送功能有一个错误。它总是覆盖数组的前面。

使用中间new_chunk数组会使编译器更加努力。更好[和更快]重构代码以在数组中分配更多空间并直接写入数组。

如果在执行realloc时额外加上一个“推测性增长长度”,代码会更快。这减少了realloc的调用次数。也就是说,当调用realloc时,allocated_size和/或new_size会额外增加一个增长量(例如15),以便分配比当前需要更多的空间,为下一次push_to_array做准备。这样,第二次调用就可以避免realloc,因为仍有足够的剩余空间。

我已经创建了一个显示所有这些的测试程序。我提出了四个版本,每个版本都有逐步改进。

“best”版本大约快2.7倍。

错误:

在push_to_array函数中,这是原始代码:

    // NOTE/BUG: this is broken -- it always writes the low part of the array
    a->elements = new_size;
    memcpy(a->array,new_chunk,size * sizeof(a_type));

以下是该代码需要的内容:

    // NOTE/FIX: this writes to the correct part of the array
    memcpy(a->array + a->elements,new_chunk,size * sizeof(a_type));
    a->elements = new_size;

**重构:**

我提出了另外四个版本:

  1. 只是[最低限度]修复了错误
  2. 类似,使用new_chunk,但允许增长参数
  3. 使用增长参数并直接写入数组(即没有new_chunk)
  4. 与3类似,但内联所有代码
**代码:**

    请特别注意_fix_push与_fix_array_space函数之间的速度差异。

    另外,请参阅SIMPLE_INIT宏,该宏模仿您在构建new_chunk时所执行的操作。

    // pushary.c -- test program
    
    // pushary.h -- push to array
    
    #ifndef _pushary_h_
    #define _pushary_h_
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <string.h>
    #include <errno.h>
    #include <time.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    
    // local boolean type: bool is plain int here
    #define bool    int
    #define true    1
    #define false   0

    // MCHECK -- non-zero includes <mcheck.h> and makes MCHECKALL call
    // mcheck_check_all()
    #ifndef MCHECK
    #define MCHECK  0
    #endif

    // ABIG -- non-zero makes a_type "long long" instead of "int"
    #ifndef ABIG
    #define ABIG    0
    #endif

    // NOMACRO -- non-zero makes fix3_simple/fix4_simple use hand-written
    // assignments instead of the SIMPLE_INIT macro
    #ifndef NOMACRO
    #define NOMACRO 0
    #endif

    // NODATA1 -- non-zero removes the fix_push call from fix1_simple
    #ifndef NODATA1
    #define NODATA1 0
    #endif

    // NODATA2 -- non-zero removes the memcpy from fix2_push
    #ifndef NODATA2
    #define NODATA2 0
    #endif

    // NODATA3 -- non-zero removes the element stores from fix3_simple
    #ifndef NODATA3
    #define NODATA3 0
    #endif

    // NODATA4 -- non-zero removes the element stores from fix4_simple
    #ifndef NODATA4
    #define NODATA4 0
    #endif

    // sysfault -- print a printf-style message to stdout and exit(1)
    // NOTE: "_fmt..." is the GNU named-variadic macro syntax
    #define sysfault(_fmt...) \
        do { \
            printf(_fmt); \
            exit(1); \
        } while (0)

    // MCHECKALL -- heap consistency check when built with MCHECK, else empty
    #if MCHECK
    #include <mcheck.h>
    #define MCHECKALL       mcheck_check_all()
    #else
    #define MCHECKALL       /**/
    #endif
    
    // a_type -- array element type: long long when ABIG is set, int otherwise
    #if ABIG
    typedef long long a_type;
    #else
    typedef int a_type;
    #endif
    
    // macro for simple_function
    // NOTE: different functions could have different macros of this form
    // SIMPLE_INIT applies _cmd once to each of the 15 element expressions,
    // in element order
    // NOTE(review): the arguments are not parenthesized inside the
    // expressions, so pass simple operands only
    #define SIMPLE_INIT(_cmd,_a,_b,_c,_d) \
        _cmd(_a) _cmd(_b) _cmd(_a + _b) _cmd(_d) _cmd(_c) \
        _cmd(_c) _cmd(_c) _cmd(_c + _d) _cmd(_b + _d) _cmd(_a) \
        _cmd(_a + _c) _cmd(_b + _c) _cmd(_c + _d) _cmd(_c + _d) _cmd(_c)

    // _SIZE -- SIMPLE_INIT command: contributes "+ 1" per element, so the
    // whole expansion evaluates to the element count
    #define _SIZE(_val) \
        + 1

    // _SET -- SIMPLE_INIT command: stores one element; requires locals named
    // "ptr" (destination) and "idx" (running index) at the point of use
    #define _SET(_val) \
        ptr[idx++] = _val;
    
    // growable array of a_type plus per-test timing fields
    struct array_of_a_type {
        const char *sym;                    // NOTE(review): not referenced in the code shown here -- presumably a label; confirm
        size_t allocated_size;              // capacity of array, in elements
        size_t grow_size;                   // amount to grow on realloc
        size_t elements;                    // 1-index based
        a_type *array;                      // element buffer (managed with realloc)
        double elap;                        // elapsed time
        double rate;                        // rate
    };
    typedef struct array_of_a_type a_list;
    
    // simple_p -- pointer to a "simple function" variant under test
    // (orig_simple, fix1_simple, ...); these return true on failure
    typedef bool (*simple_p)(a_list *ary,int a,int b,int c,int d);
    
    // INLINE -- storage/inline specifier for the helper functions below
    // change "#if 0" to "#if 1" to drop the forced-inline attribute
    #if 0
    #define INLINE  static inline
    #else
    #define INLINE  __attribute__((__always_inline__)) static inline
    #endif
    
    // test control
    // one instance per benchmarked function (allocated by defone)
    typedef struct tstctl tstctl_t;
    struct tstctl {
        tstctl_t *tst_next;                 // tstorder linkage
        const char *tst_tag;                // test name
        simple_p tst_proc;                  // simple function
        double tst_bestrat;                 // best ratio
        int tst_bestgrow;                   // best growlen
        double tst_elap;                    // current/best elapsed time
        double tst_rate;                    // current rate
        int tst_trybest;                    // best trial
        a_list tst_lst;                     // array/list
    };
    
    // _fix_push -- original push (with bug fix)
    // appends size elements from new_chunk to the end of a, growing the
    // buffer to exactly the needed size when it is too small
    // RETURNS: false on success (a realloc failure terminates via sysfault)
    INLINE bool
    _fix_push(a_list *a,a_type *new_chunk,size_t size)
    {
        const size_t used = a->elements;
        const size_t needed = used + size;

        if (a->allocated_size < needed) {
            a_type *grown = realloc(a->array,needed * sizeof(a_type));

            if (grown == NULL) {
                sysfault("_fix_push: realloc error -- %s\n",strerror(errno));
                return true;
            }

            a->array = grown;
            a->allocated_size = needed;
        }

        // NOTE/FIX: copy behind the elements already in use
        memcpy(&a->array[used],new_chunk,size * sizeof(a_type));
        a->elements = needed;

        return false;
    }
    
    // _fix_array_space -- allocate space in array
    // reserves count elements at the end of a and counts them as used
    // RETURNS: pointer to place to store (or NULL)
    INLINE a_type *
    _fix_array_space(a_list *a,size_t count)
    {
        const size_t first_new = a->elements;
        const size_t needed = first_new + count;

        if (needed > a->allocated_size) {
            // over-allocate so that not every push has to call realloc
            // NOTE: grow_size is arbitrary -- pick any optimal value
            const size_t capacity = a->allocated_size + needed + a->grow_size;
            a_type *grown = realloc(a->array,capacity * sizeof(a_type));

            if (grown == NULL) {
                sysfault("_fix_array_space: realloc error -- %s\n",strerror(errno));
                return NULL;
            }

            a->allocated_size = capacity;
            a->array = grown;
        }

        a->elements = needed;

        return a->array + first_new;
    }
    
    // /home/cae/OBJ/ovrgen/pushary/pushary.proto -- prototypes
    // declarations for every non-inline function of the test program,
    // grouped by the source file that defines them

    // FILE: /home/cae/preserve/ovrbnc/pushary/com.c
    // com.c -- common routines

        // fix_array_space -- allocate space in array
        // RETURNS: pointer to place to store (or NULL)
        a_type *
        fix_array_space(a_list *a,size_t count);

        // fix_push -- original push (with bug fix)
        bool
        fix_push(a_list *a,a_type *new_chunk,size_t size);

    // FILE: /home/cae/preserve/ovrbnc/pushary/fix1.c
    // fix1.c -- push to array
    //
    // fixes bug in orig

        bool
        fix1_simple(a_list *my_array,int a,int b,int c,int d);

    // FILE: /home/cae/preserve/ovrbnc/pushary/fix2.c
    // fix2.c -- push to array
    //
    // uses new_chunk array
    // uses push function
    // uses grow length

        bool
        fix2_simple(a_list *my_array,int a,int b,int c,int d);

        bool
        fix2_push(a_list *a,a_type *new_chunk,size_t size);

    // FILE: /home/cae/preserve/ovrbnc/pushary/fix3.c
    // fix3.c -- push to array
    //
    // uses grow length
    // uses non-inline space function

        bool
        fix3_simple(a_list *my_array,int a,int b,int c,int d);

    // FILE: /home/cae/preserve/ovrbnc/pushary/fix4.c
    // fix4.c -- push to array
    //
    // uses grow length
    // uses inline space function

        bool
        fix4_simple(a_list *my_array,int a,int b,int c,int d);

    // FILE: /home/cae/preserve/ovrbnc/pushary/orig.c
    // orig.c -- push to array

        bool
        orig_simple(a_list *my_array,int a,int b,int c,int d);

        bool
        orig_push(a_list *a,a_type *new_chunk,size_t size);

    // FILE: /home/cae/preserve/ovrbnc/pushary/pushary.c
    // pushary.c -- test program

        // main -- main program
        int
        main(int argc,char **argv);

        // usage -- show usage
        void
        usage(void);

        // gendata -- generate data
        void
        gendata(void);

        // defall -- define all tests
        void
        defall(void);

        // defone -- define one test
        tstctl_t *
        defone(simple_p proc,const char *tag);

        // testall -- test all
        void
        testall(void);

        // testone -- test a function
        void
        testone(tstctl_t *tst);

        // _testone -- test a function
        void
        _testone(tstctl_t *tst,int trycnt,double *elap);

        // ratshow -- show ratio
        void
        ratshow(tstctl_t *tlhs,tstctl_t *trhs,int bestflg);

        // arycmp -- compare arrays
        void
        arycmp(tstctl_t *tlhs,tstctl_t *trhs);

        // arykill -- release an array
        void
        arykill(tstctl_t *tst);

        // tvsecf -- get hi-res time
        double
        tvsecf(void);
    
    #endif
    
    // orig.c -- push to array

    // orig_simple -- baseline: build the 15-element chunk in a local array
    // and hand it to orig_push
    // RETURNS: result of orig_push (true on failure)
    bool
    orig_simple(a_list *my_array,int a,int b,int c,int d)
    {
        a_type new_chunk[] = {
            a,     b,     a + b, d,     c,
            c,     c,     c + d, b + d, a,
            a + c, b + c, c + d, c + d, c,
        };

        return orig_push(my_array,new_chunk,sizeof(new_chunk) / sizeof(new_chunk[0]));
    }
    
    // orig_push -- baseline push, kept with its original defect for comparison
    // RETURNS: true if realloc failed, false otherwise
    bool
    orig_push(a_list *a,a_type *new_chunk,size_t size)
    {
        const size_t needed = a->elements + size;

        if (a->allocated_size < needed) {
            a_type *grown = realloc(a->array,needed * sizeof(a_type));

            if (! grown)
                return true;

            a->array = grown;
            a->allocated_size = needed;
        }

        // NOTE/BUG: this is broken -- it always writes the low part of the array
        memcpy(a->array,new_chunk,size * sizeof(a_type));
        a->elements = needed;

        return false;
    }
    // fix1.c -- push to array
    //
    // fixes bug in orig

    // fix1_simple -- build the 15-element chunk in a local array and append
    // it with fix_push
    // RETURNS: result of fix_push; when built with NODATA1 the push is
    // skipped and false is returned
    bool
    fix1_simple(a_list *my_array,int a,int b,int c,int d)
    {
        a_type new_chunk[] = {
            a, b, a + b, d, c,
            c, c, c + d, b + d, a,
            a + c, b + c, c + d, c + d, c,
        };
        size_t size = sizeof(new_chunk) / sizeof(a_type);

    #if NODATA1 == 0
        return fix_push(my_array,new_chunk,size);
    #else
        // push disabled: still return a defined value instead of falling
        // off the end of a non-void function
        (void) my_array;
        (void) new_chunk;
        (void) size;
        return false;
    #endif
    }
    // fix2.c -- push to array
    //
    // uses new_chunk array
    // uses push function
    // uses grow length

    // fix2_simple -- build the 15-element chunk in a local array and append
    // it with fix2_push
    // RETURNS: result of fix2_push (true on failure)
    bool
    fix2_simple(a_list *my_array,int a,int b,int c,int d)
    {
        a_type new_chunk[] = {
            a,     b,     a + b, d,     c,
            c,     c,     c + d, b + d, a,
            a + c, b + c, c + d, c + d, c,
        };

        return fix2_push(my_array,new_chunk,sizeof(new_chunk) / sizeof(new_chunk[0]));
    }
    
    // fix2_push -- reserve space with fix_array_space, then copy the chunk in
    // RETURNS: true if no space could be reserved, false otherwise
    bool
    fix2_push(a_list *a,a_type *new_chunk,size_t size)
    {
        a_type *dest = fix_array_space(a,size);

        if (dest == NULL)
            return true;

        // NOTE/FIX: dest is the first newly reserved slot, not the array base
    #if NODATA2 == 0
        memcpy(dest,new_chunk,size * sizeof(a_type));
    #endif

        return false;
    }
    // fix3.c -- push to array
    //
    // uses grow length
    // uses non-inline space function

    // fix3_simple -- reserve 15 slots with fix_array_space and store the
    // elements directly into the array (no temporary chunk)
    // RETURNS: true if no space could be reserved, false otherwise
    bool
    fix3_simple(a_list *my_array,int a,int b,int c,int d)
    {
        // element count: literal, or derived from SIMPLE_INIT (_SIZE ignores
        // its argument, so the 1,2,3,4 values are placeholders)
    #if NOMACRO
        size_t count = 15;
    #else
        size_t count = SIMPLE_INIT(_SIZE,1,2,3,4);
    #endif
        a_type *ptr;

        // use non-inline function
        ptr = fix_array_space(my_array,count);
        if (ptr == NULL)
            return true;

        // NOTE: these optimize to _exactly_ the same code
    #if NODATA3 == 0
    #if NOMACRO
        ptr[0] = a;
        ptr[1] = b;
        ptr[2] = a + b;
        ptr[3] = d;
        ptr[4] = c;
        ptr[5] = c;
        ptr[6] = c;
        ptr[7] = c + d;
        ptr[8] = b + d;
        ptr[9] = a;
        ptr[10] = a + c;
        ptr[11] = b + c;
        ptr[12] = c + d;
        ptr[13] = c + d;
        ptr[14] = c;
    #else
        // _SET expands to "ptr[idx++] = value;" -- the names ptr and idx
        // are required by the macro
        int idx = 0;
        SIMPLE_INIT(_SET,a,b,c,d)
    #endif
    #endif

        return false;
    }
    // fix4.c -- push to array
    //
    // uses grow length
    // uses inline space function

    // fix4_simple -- same as fix3_simple, but reserves the 15 slots with the
    // inline _fix_array_space
    // RETURNS: true if no space could be reserved, false otherwise
    bool
    fix4_simple(a_list *my_array,int a,int b,int c,int d)
    {
        // element count: literal, or derived from SIMPLE_INIT (_SIZE ignores
        // its argument, so the 1,2,3,4 values are placeholders)
    #if NOMACRO
        size_t count = 15;
    #else
        size_t count = SIMPLE_INIT(_SIZE,1,2,3,4);
    #endif
        a_type *ptr;

        // use inline function
        ptr = _fix_array_space(my_array,count);
        if (ptr == NULL)
            return true;

        // NOTE: these optimize to _exactly_ the same code
    #if NODATA4 == 0
    #if NOMACRO
        ptr[0] = a;
        ptr[1] = b;
        ptr[2] = a + b;
        ptr[3] = d;
        ptr[4] = c;
        ptr[5] = c;
        ptr[6] = c;
        ptr[7] = c + d;
        ptr[8] = b + d;
        ptr[9] = a;
        ptr[10] = a + c;
        ptr[11] = b + c;
        ptr[12] = c + d;
        ptr[13] = c + d;
        ptr[14] = c;
    #else
        // _SET expands to "ptr[idx++] = value;" -- the names ptr and idx
        // are required by the macro
        int idx = 0;
        SIMPLE_INIT(_SET,a,b,c,d)
    #endif
    #endif

        return false;
    }
    // com.c -- common routines

    // fix_array_space -- allocate space in array
    // out-of-line wrapper around the inline _fix_array_space
    // RETURNS: pointer to place to store (or NULL)
    a_type *
    fix_array_space(a_list *a,size_t count)
    {
        a_type *slot = _fix_array_space(a,count);

        return slot;
    }
    
    // fix_push -- original push (with bug fix)
    // out-of-line wrapper around the inline _fix_push
    bool
    fix_push(a_list *a,a_type *new_chunk,size_t size)
    {
        bool failed = _fix_push(a,new_chunk,size);

        return failed;
    }
    
    // command line options (see usage and main)
    int opt_f;                              // -f: fork mode (main sets it to 1 before parsing)
    int opt_m;                              // -m: force/test mcheck failure
    int opt_o;                              // -o: speed reference selector
    int opt_D;                              // -D: use real random test data
    int opt_M;                              // -M: number of times to push to array
    int opt_G;                              // -G: grow length (<0 selects a range)
    int opt_s;                              // -s: subtrials

    // default for -M
    #define MDFT 1000000

    int growlen;                            // grow length currently under test
    int growmin;                            // first grow length to test
    int growmax;                            // last grow length to test
    int growbest;                           // NOTE(review): not used in the code shown here

    double ratbest;                         // NOTE(review): not used in the code shown here

    int datamax;                            // number of elements in testdata (opt_M * 4)
    a_type *testdata;                       // test data from gendata (only with -D or -m)

    // test controls created by defall
    tstctl_t *orig1;
    tstctl_t *fix1;
    tstctl_t *fix2;
    tstctl_t *fix3;
    tstctl_t *fix4;
    tstctl_t *orig2;
    tstctl_t *orig3;

    tstctl_t *tstref;                       // reference test selected by -o
    tstctl_t *tstorder;                     // list head, reset by testall
    
    // main -- main program
    // parses the options, generates test data, defines the tests and runs
    // testall once for every grow length in [growmin,growmax]
    int
    main(int argc,char **argv)
    {
        char *cp;
        pid_t pid;

        // skip the program name
        --argc;
        ++argv;

        // defaults: grow lengths 0..25, fork mode on, MDFT pushes
        opt_G = -25;
        opt_f = 1;
        opt_M = MDFT;

        // options are single letters; a value, if any, follows immediately
        // in the same argument (e.g. -M1000)
        for (;  argc > 0;  --argc, ++argv) {
            cp = *argv;
            if (*cp != '-')
                break;

            switch (cp[1]) {
            case 'f':
                opt_f = ! opt_f;
                break;

            case 'o':
                cp += 2;
                opt_o = (*cp != 0) ? atoi(cp) : 2;
                break;

            case 'D':
                opt_D = ! opt_D;
                break;

            case 'm':
                // -m needs an MCHECK build; usage exits otherwise
    #if MCHECK == 0
                usage();
    #endif
                opt_m = ! opt_m;
                break;

            case 'M':
                cp += 2;
                opt_M = atoi(cp);
                break;

            case 'G':
                cp += 2;
                opt_G = (*cp != 0) ? atoi(cp) : 25;
                break;

            case 's':
                cp += 2;
                opt_s = (*cp != 0) ? atoi(cp) : 3;
                break;

            default:
                usage();
                break;
            }
        }

        // a zero push count falls back to the default
        if (! opt_M)
            opt_M = MDFT;
        printf("M=%d\n",opt_M);
        datamax = opt_M * 4;

        printf("D=%d\n",opt_D);
        gendata();

        // negative -G: test every grow length from 0 to -opt_G
        // otherwise: test only the given grow length
        if (opt_G < 0) {
            growmin = 0;
            growmax = -opt_G;
        }
        else {
            growmin = opt_G;
            growmax = opt_G;
        }

        growlen = growmin;

        printf("f=%d\n",opt_f);

        if (opt_s <= 0)
            opt_s = 1;
        printf("s=%d\n",opt_s);

        defall();

        for (growlen = growmin;  growlen <= growmax;  ++growlen) {
            // no fork mode: run the tests in this process
            if (! opt_f) {
                testall();
                continue;
            }

            // flush before fork so buffered output is not duplicated in the child
            fflush(stdout);
            fflush(stderr);

            pid = fork();

            if (pid < 0) {
                perror("fork");
                exit(1);
            }

            // child: run the tests for this grow length, then exit
            if (pid == 0) {
                testall();
                exit(0);
            }

            // parent: wait for the child before starting the next grow length
            waitpid(pid,NULL,0);
        }

        return 0;
    }
    
    // usage -- show usage
    // prints the option summary to stdout and exits with status 1
    void
    usage(void)
    {
        const char *forkdft = opt_f ? "on" : "off";
        const char *mchknote = MCHECK ? "" : " (requires rebuild with -DMCHECK=1 and -lmcheck)";

        printf("  -f -- invert fork mode (DEFAULT: %s)\n",forkdft);
        fputs("  -D -- use real random test data (DEFAULT: off)\n",stdout);
        printf("  -G[grow_length] -- array speculative grow length (DEFAULT: %d)\n",opt_G);
        fputs("    <0 -- benchmark mode range\n",stdout);
        fputs("    >=0 -- single grow length with data compare\n",stdout);
        printf("  -M[push_calls] -- number of times to push to array (DEFAULT: %d)\n",MDFT);
        fputs("  -s<subtrials> -- (DEFAULT: 1)\n",stdout);
        fputs("  -o<speed reference> -- (DEFAULT: 0)\n",stdout);
        fputs("    0 -- use fix1\n",stdout);
        fputs("    1 -- use orig (1st invocation)\n",stdout);
        fputs("    2 -- use orig (2nd invocation)\n",stdout);
        printf("  -m -- force/test mcheck failure%s\n",mchknote);

        exit(1);
    }
    
    // gendata -- generate the shared test data buffer
    //
    // Does nothing unless real random data (opt_D) or the forced mcheck
    // failure (opt_m) was requested.  Otherwise it allocates the global
    // testdata buffer with room for datamax elements and either:
    //   - opt_m: deliberately scribbles 20 ints starting 10 ints BEFORE the
    //     buffer, so that the heap check that follows has something to detect
    //   - else: fills every element with rand()
    //
    // Terminates the program if the allocation fails.
    // NOTE(review): MCHECKALL is a macro defined elsewhere in the file --
    // presumably a heap consistency check; confirm against its definition.
    void
    gendata(void)
    {
        int *ptr;
        int idx;

        if (opt_D || opt_m) {
            MCHECKALL;
            testdata = malloc(sizeof(a_type) * datamax);

            // never write through a failed allocation
            if (testdata == NULL) {
                perror("malloc");
                exit(1);
            }

            // force an mcheck exception: this out-of-bounds write is intentional
            if (opt_m) {
                ptr = testdata;
                ptr -= 10;
                for (idx = 0;  idx < 20;  ++idx)
                    ptr[idx] = rand();
            }
            else {
                for (idx = 0;  idx < datamax;  ++idx)
                    testdata[idx] = rand();
            }

            MCHECKALL;
        }
    }
    
    // defall -- create every test descriptor and choose the speed reference
    //
    // Each descriptor is built by defone() and stored in its global pointer.
    // orig_simple is registered three times (org1/org2/org3) so it can be run
    // at several positions in the test sequence.  The reference test used for
    // ratio reporting is selected by opt_o; any value other than 1 or 2 is
    // normalized to 0, which selects fix1.
    void
    defall(void)
    {

        orig1 = defone(orig_simple,"org1");
        fix1 = defone(fix1_simple,"fix1");
        fix2 = defone(fix2_simple,"fix2");
        fix3 = defone(fix3_simple,"fix3");
        fix4 = defone(fix4_simple,"fix4");
        orig2 = defone(orig_simple,"org2");
        orig3 = defone(orig_simple,"org3");

        if (opt_o == 1)
            tstref = orig1;
        else if (opt_o == 2)
            tstref = orig2;
        else {
            // unknown selector: fall back to fix1
            opt_o = 0;
            tstref = fix1;
        }

        printf("reference test is %s\n",tstref->tst_tag);
    }
    
    // defone -- define a single test
    //
    // Allocates a zero-filled test descriptor and records the function under
    // test together with its display tag.
    //
    // proc -- function to benchmark
    // tag  -- short name printed in the results (the pointer is stored, not
    //         copied, so it must outlive the descriptor)
    //
    // Returns the new descriptor.  Terminates the program if the allocation
    // fails, so the result is never NULL.
    tstctl_t *
    defone(simple_p proc,const char *tag)
    {
        tstctl_t *tst;

        tst = calloc(1,sizeof(*tst));
        if (tst == NULL) {
            perror("calloc");
            exit(1);
        }

        tst->tst_tag = tag;
        tst->tst_proc = proc;

        // explicit, rather than relying on all-bits-zero from calloc
        tst->tst_bestrat = 0;

        return tst;
    }
    
    // testall -- run every test for the current growlen and report
    //
    // Sequence:
    //   1. print the growth length being tested
    //   2. run each test with testone(), which also queues it on tstorder
    //   3. print each queued test's timing relative to the reference (tstref)
    //   4. when a single growth length was requested (opt_G >= 0), compare
    //      every result array against the one produced by fix1
    //   5. release all result arrays
    void
    testall(void)
    {
        tstctl_t *cur;
        tstctl_t *next;

        printf("\n");
        printf("G=%d\n",growlen);

        // start with an empty execution-order list
        tstorder = NULL;

        // perform tests -- orig is interleaved between the fixed versions
        testone(orig1);
        testone(fix1);
        testone(orig2);
        testone(fix2);
        testone(orig3);
        testone(fix3);
        testone(fix4);

        // show benchmarks
        for (cur = tstorder;  cur != NULL;  cur = cur->tst_next)
            ratshow(tstref,cur,1);

        // compare data
        if (opt_G >= 0) {
            for (cur = tstorder;  cur != NULL;  cur = cur->tst_next)
                arycmp(fix1,cur);
        }

        // release all array memory -- fetch the link before killing the node
        cur = tstorder;
        while (cur != NULL) {
            next = cur->tst_next;
            arykill(cur);
            cur = next;
        }
    }
    
    // testone -- run all subtrials for one test and queue it for reporting
    //
    // Resets the test's best elapsed time and its array descriptor, runs
    // opt_s subtrials through _testone() (which keeps the best time in the
    // descriptor), then appends the test to the tail of the global tstorder
    // list so results are reported in execution order.
    //
    // tst -- test descriptor created by defone()
    void
    testone(tstctl_t *tst)
    {
        a_list *ary;
        int trycnt;
        double elapv[opt_s];
        tstctl_t *cur;
        tstctl_t *prev;

        // sentinel: any real measurement will be smaller
        tst->tst_elap = 1e20;

        ary = &tst->tst_lst;
        memset(ary,0,sizeof(a_list));

        ary->sym = tst->tst_tag;
        ary->grow_size = growlen;

        for (trycnt = 0;  trycnt < opt_s;  ++trycnt)
            _testone(tst,trycnt,&elapv[trycnt]);

        // The descriptor may still carry a link from a previous testall() pass
        // in the same process (non-fork mode).  Clear it before appending,
        // otherwise the stale link turns tstorder into a cycle and the tail
        // search below never terminates.
        tst->tst_next = NULL;

        // append to the tail of the execution-order list
        prev = NULL;
        for (cur = tstorder;  cur != NULL;  cur = cur->tst_next)
            prev = cur;
        if (prev != NULL)
            prev->tst_next = tst;
        else
            tstorder = tst;
    }
    
    // _testone -- run and time a single subtrial of one test
    //
    // Empties the test's array, then calls the function under test once per
    // group of four input values while measuring wall time with tvsecf().
    // The elapsed time and the resulting rate (elements per second) are stored
    // in the array descriptor; if this subtrial is the fastest so far, the
    // test descriptor's best time, rate and trial index are updated too.
    //
    // tst    -- test descriptor
    // trycnt -- index of this subtrial (recorded when it is the best one)
    // elap   -- receives the elapsed time of this subtrial in seconds
    void
    _testone(tstctl_t *tst,int trycnt,double *elap)
    {
        a_list *lst = &tst->tst_lst;
        simple_p push;
        double tstart;
        double tspan;

        // every subtrial starts from an empty array
        arykill(tst);

        push = tst->tst_proc;

        MCHECKALL;

        tstart = tvsecf();

        if (! opt_D) {
            // use the same test data -- faster and gives truer benchmark for
            // the function being tested
            int remain = datamax;

            while (remain > 0) {
                push(lst,1,2,3,4);
                remain -= 4;
            }
        }
        else {
            // use real test data -- good for comparisons
            a_type *cur = testdata;
            a_type *end = cur + datamax;

            while (cur < end) {
                push(lst,cur[0],cur[1],cur[2],cur[3]);
                cur += 4;
            }
        }

        tspan = tvsecf();
        tspan -= tstart;

        MCHECKALL;

        lst->elap = tspan;
        lst->rate = lst->elements;
        lst->rate /= tspan;

        // remember the fastest subtrial
        if (lst->elap < tst->tst_elap) {
            tst->tst_trybest = trycnt;
            tst->tst_rate = lst->rate;
            tst->tst_elap = lst->elap;
        }

        *elap = tspan;
    }
    
    // ratshow -- print one test's best time and its speed relative to a reference
    //
    // Prints "<tag> <elapsed>" for trhs and, unless trhs is the reference
    // itself, " is N.NNNx faster|slower" based on the two tests' rates.
    //
    // tlhs    -- reference test
    // trhs    -- test being reported
    // bestflg -- when non-zero, also track the best signed ratio seen for trhs
    //            (negative when slower) and the growlen that produced it
    void
    ratshow(tstctl_t *tlhs,tstctl_t *trhs,int bestflg)
    {
        double refrate = tlhs->tst_rate;
        double currate = trhs->tst_rate;
        int faster = (currate > refrate);
        double ratio;

        printf("%s %.9f",trhs->tst_tag,trhs->tst_elap);

        // always express the ratio as larger-rate / smaller-rate
        ratio = faster ? (currate / refrate) : (refrate / currate);

        if (tlhs != trhs)
            printf(" is %.3fx %s",
                ratio,faster ? "faster" : "slower");

        if (bestflg) {
            // slower results are ranked as negative ratios
            if (! faster)
                ratio = -ratio;

            // written as a negated <= so an unordered (NaN) ratio is handled
            // exactly as the comparison it replaces
            if (! (ratio <= trhs->tst_bestrat)) {
                trhs->tst_bestrat = ratio;
                trhs->tst_bestgrow = growlen;
            }
        }

        printf("\n");
    }
    
    // arycmp -- compare the result arrays of two tests
    //
    // Silently does nothing if either test has no array.  Otherwise reports
    // the first difference found (element count, then element values) or
    // prints "<tag>: MATCH" for trhs when the arrays are identical.
    //
    // tlhs -- test whose array is the expected result
    // trhs -- test whose array is being checked
    void
    arycmp(tstctl_t *tlhs,tstctl_t *trhs)
    {
        a_list *alhs = &tlhs->tst_lst;
        a_list *arhs = &trhs->tst_lst;

        // nothing to compare unless both tests produced an array
        if (alhs->array == NULL || arhs->array == NULL)
            return;

        // size_t values need %zu; the casts keep the call correct whatever
        // unsigned type the elements field is declared with
        if (alhs->elements != arhs->elements) {
            printf("arycmp: count mismatch -- %s=%zu %s=%zu\n",
                alhs->sym,(size_t) alhs->elements,
                arhs->sym,(size_t) arhs->elements);
            return;
        }

        for (size_t idx = 0;  idx < alhs->elements;  ++idx) {
            a_type lhs = alhs->array[idx];
            a_type rhs = arhs->array[idx];

            // report only the first differing element
            if (lhs != rhs) {
                printf("arycmp: data mismatch -- idx=%zu %s=%d %s=%d\n",
                    idx,alhs->sym,lhs,arhs->sym,rhs);
                return;
            }
        }

        printf("%s: MATCH\n",arhs->sym);
    }
    
    // arykill -- release a test's array and reset it to the empty state
    //
    // Frees the element storage (if any), bracketing the free with MCHECKALL,
    // and zeroes the element count and allocated size.  Safe to call on an
    // already-empty array.
    //
    // tst -- test descriptor owning the array
    void
    arykill(tstctl_t *tst)
    {
        a_list *lst = &tst->tst_lst;

        if (lst->array != NULL) {
            MCHECKALL;
            free(lst->array);
            MCHECKALL;

            // do not leave a dangling pointer behind
            lst->array = NULL;
        }

        lst->elements = 0;
        lst->allocated_size = 0;
    }
    
    // tvsecf -- get hi-res time
    //
    // Reads CLOCK_REALTIME and returns it as fractional seconds (whole seconds
    // plus nanoseconds scaled by 1e9) in a double.
    double
    tvsecf(void)
    {
        struct timespec now;
        double frac;

        clock_gettime(CLOCK_REALTIME,&now);

        // sub-second part first, then add the whole seconds
        frac = now.tv_nsec;
        frac /= 1e9;

        return frac + now.tv_sec;
    }
    

    **基准:**

    各种方法的基准输出。它太大了,放不进这个答案,所以我将在下面的第二个答案中发布它。

答案 6 :(得分:0)

注意:由于篇幅限制,这个答案是我原来答案的延续(例如,如果要点赞,请投给上面的原答案:https://stackoverflow.com/a/39562827/5382650)

**基准:**

由于原始代码中的错误,它会使基准测试结果出现偏差,因为它会人为地获得更好的缓存性能,因为它遍历整个数组。也就是说,它似乎比修复错误时的实际效果要好。

因此,使用fix1将是基线性能的更好指标,这就是下面的数据所使用的。原始[没有修复]产生0.02但fix1产生0.06,我认为这是更正确的数字。

增长因子是一个调整参数,在取“最佳”值时,fix4版本可获得2.7倍的提升。这就是我认为最值得信赖的结果。

然而,尽管进行了大量的单元测试、更长时间的测试、应用mcheck(3)等,数据中仍存在我无法解释的异常。我保留了原始算法[有错误的版本]作为测试的一部分。如果原始版本是第一个运行的测试,或者是在fix1之后运行,则会产生“偏斜的”结果。

但是,如果原始版本在fix2、fix3或fix4之后运行,有时它的性能会比自身差10倍。原始版本并不使用增长值,但其行为似乎取决于先前算法所使用的推测增长因子。

有时,处于“不稳定”位置的原始版本会给出偏斜的/人为偏低的值(约0.02)。当它“失常”时,会慢10倍(约0.2)。

它似乎会连续出现一阵“坏运气”和一阵“好运气”。如果给-G选项不同的值(例如-G-300,它将测试0到300之间的所有增长值),就会进行多次运行。

我认为不稳定的结果并不相关,但无论如何我都保留了它们。它可能只是噪声,即值是可以的,并且由于内存重新分配器中的某些内部导致它进行更多内部空闲块拆分/合并等而波动。

据我所知(AFAICT),这并不是由于越出realloc区域的边界造成的,因为该程序具有执行mcheck的模式,而该模式下一切检查结果均正常。

M=1000000
D=0
f=1
s=1
reference test is fix1

G=0
org1 0.028413773 is 2.462x faster
fix1 0.069955111
org2 0.035362244 is 1.978x faster
fix2 0.032926321 is 2.125x faster
org3 0.268535376 is 3.839x slower
fix3 0.026652813 is 2.625x faster
fix4 0.025245905 is 2.771x faster

G=1
org1 0.027498960 is 2.517x faster
fix1 0.069201946
org2 0.033916712 is 2.040x faster
fix2 0.031118631 is 2.224x faster
org3 0.264514446 is 3.822x slower
fix3 0.026646614 is 2.597x faster
fix4 0.025324345 is 2.733x faster

G=2
org1 0.026978731 is 2.496x faster
fix1 0.067343950
org2 0.034334421 is 1.961x faster
fix2 0.031268835 is 2.154x faster
org3 0.266630888 is 3.959x slower
fix3 0.026658535 is 2.526x faster
fix4 0.025254488 is 2.667x faster

G=3
org1 0.027746677 is 2.495x faster
fix1 0.069227457
org2 0.033862829 is 2.044x faster
fix2 0.031069279 is 2.228x faster
org3 0.287544250 is 4.154x slower
fix3 0.026713371 is 2.591x faster
fix4 0.025189638 is 2.748x faster

G=4
org1 0.027034283 is 2.527x faster
fix1 0.068307161
org2 0.033991575 is 2.010x faster
fix2 0.031272411 is 2.184x faster
org3 0.311707735 is 4.563x slower
fix3 0.026990414 is 2.531x faster
fix4 0.025078297 is 2.724x faster

G=5
org1 0.027446985 is 2.429x faster
fix1 0.066675663
org2 0.033823967 is 1.971x faster
fix2 0.031498909 is 2.117x faster
org3 0.331423283 is 4.971x slower
fix3 0.026667356 is 2.500x faster
fix4 0.026413918 is 2.524x faster

G=6
org1 0.027255535 is 2.428x faster
fix1 0.066179037
org2 0.033841848 is 1.956x faster
fix2 0.031159401 is 2.124x faster
org3 0.335711241 is 5.073x slower
fix3 0.026690722 is 2.479x faster
fix4 0.025039911 is 2.643x faster

G=7
org1 0.027280807 is 2.440x faster
fix1 0.066556692
org2 0.034326553 is 1.939x faster
fix2 0.031259060 is 2.129x faster
org3 0.331621408 is 4.983x slower
fix3 0.026686430 is 2.494x faster
fix4 0.025387526 is 2.622x faster

G=8
org1 0.027087212 is 2.453x faster
fix1 0.066447973
org2 0.033598185 is 1.978x faster
fix2 0.031176090 is 2.131x faster
org3 0.034165382 is 1.945x faster
fix3 0.026757479 is 2.483x faster
fix4 0.025131702 is 2.644x faster

G=9
org1 0.027328253 is 2.451x faster
fix1 0.066978931
org2 0.034043789 is 1.967x faster
fix2 0.031486034 is 2.127x faster
org3 0.033723354 is 1.986x faster
fix3 0.027368069 is 2.447x faster
fix4 0.025647879 is 2.611x faster

G=10
org1 0.027052402 is 2.458x faster
fix1 0.066498756
org2 0.033848524 is 1.965x faster
fix2 0.031741381 is 2.095x faster
org3 0.033836603 is 1.965x faster
fix3 0.027002096 is 2.463x faster
fix4 0.025351524 is 2.623x faster

G=11
org1 0.027157784 is 2.471x faster
fix1 0.067117691
org2 0.033848047 is 1.983x faster
fix2 0.031594038 is 2.124x faster
org3 0.034133911 is 1.966x faster
fix3 0.027194977 is 2.468x faster
fix4 0.025204659 is 2.663x faster

G=12
org1 0.027328730 is 2.432x faster
fix1 0.066454649
org2 0.033915043 is 1.959x faster
fix2 0.031331778 is 2.121x faster
org3 0.033701420 is 1.972x faster
fix3 0.026796579 is 2.480x faster
fix4 0.025482893 is 2.608x faster

G=13
org1 0.027091503 is 2.520x faster
fix1 0.068269968
org2 0.033600807 is 2.032x faster
fix2 0.031302691 is 2.181x faster
org3 0.034220219 is 1.995x faster
fix3 0.026732683 is 2.554x faster
fix4 0.025168657 is 2.712x faster

G=14
org1 0.027466774 is 2.403x faster
fix1 0.065990925
org2 0.034015417 is 1.940x faster
fix2 0.031306028 is 2.108x faster
org3 0.033681631 is 1.959x faster
fix3 0.026975870 is 2.446x faster
fix4 0.025142908 is 2.625x faster

G=15
org1 0.030098915 is 2.202x faster
fix1 0.066287756
org2 0.033817768 is 1.960x faster
fix2 0.031510592 is 2.104x faster
org3 0.264448166 is 3.989x slower
fix3 0.026585102 is 2.493x faster
fix4 0.025573254 is 2.592x faster

G=16
org1 0.029087305 is 2.289x faster
fix1 0.066566944
org2 0.034010649 is 1.957x faster
fix2 0.032317400 is 2.060x faster
org3 0.269736767 is 4.052x slower
fix3 0.026986122 is 2.467x faster
fix4 0.025726795 is 2.587x faster

G=17
org1 0.027568817 is 2.418x faster
fix1 0.066652775
org2 0.033725500 is 1.976x faster
fix2 0.031077385 is 2.145x faster
org3 0.270752668 is 4.062x slower
fix3 0.028372288 is 2.349x faster
fix4 0.026800632 is 2.487x faster

G=18
org1 0.028200626 is 2.466x faster
fix1 0.069550514
org2 0.035360813 is 1.967x faster
fix2 0.033010244 is 2.107x faster
org3 0.308327198 is 4.433x slower
fix3 0.028569698 is 2.434x faster
fix4 0.028189659 is 2.467x faster

G=19
org1 0.028352022 is 2.457x faster
fix1 0.069663048
org2 0.035186291 is 1.980x faster
fix2 0.033131599 is 2.103x faster
org3 0.302445412 is 4.342x slower
fix3 0.028528690 is 2.442x faster
fix4 0.026380062 is 2.641x faster

G=20
org1 0.028351307 is 2.449x faster
fix1 0.069445372
org2 0.035343409 is 1.965x faster
fix2 0.032827139 is 2.115x faster
org3 0.333808899 is 4.807x slower
fix3 0.028279066 is 2.456x faster
fix4 0.026592016 is 2.612x faster

G=21
org1 0.028333902 is 2.457x faster
fix1 0.069613457
org2 0.035215616 is 1.977x faster
fix2 0.033250570 is 2.094x faster
org3 0.326132298 is 4.685x slower
fix3 0.026517391 is 2.625x faster
fix4 0.025246382 is 2.757x faster

G=22
org1 0.027449369 is 2.421x faster
fix1 0.066462278
org2 0.033666849 is 1.974x faster
fix2 0.031057119 is 2.140x faster
org3 0.332618952 is 5.005x slower
fix3 0.028064966 is 2.368x faster
fix4 0.026383400 is 2.519x faster

G=23
org1 0.028641462 is 2.444x faster
fix1 0.070001602
org2 0.035483837 is 1.973x faster
fix2 0.033087969 is 2.116x faster
org3 0.342431068 is 4.892x slower
fix3 0.028344154 is 2.470x faster
fix4 0.026709557 is 2.621x faster

G=24
org1 0.028158426 is 2.468x faster
fix1 0.069482327
org2 0.035173178 is 1.975x faster
fix2 0.033740997 is 2.059x faster
org3 0.346288681 is 4.984x slower
fix3 0.028279781 is 2.457x faster
fix4 0.027346849 is 2.541x faster

G=25
org1 0.028361082 is 2.469x faster
fix1 0.070035458
org2 0.035205841 is 1.989x faster
fix2 0.032957315 is 2.125x faster
org3 0.035385132 is 1.979x faster
fix3 0.028091431 is 2.493x faster
fix4 0.026364803 is 2.656x faster