Question

考虑以下两个片段：

/* Alignment, in bytes, that ASSUME_ALIGNED promises to the compiler. */
#define ALIGN_BYTES 32
/* Reassigns x to the result of GCC's __builtin_assume_aligned(x, ALIGN_BYTES).
 * x appears twice in the expansion and is the target of an assignment, so it
 * must be a modifiable pointer lvalue. */
#define ASSUME_ALIGNED(x) x = __builtin_assume_aligned(x, ALIGN_BYTES)

/* Element-wise sum: stores a0[i] + a1[i] into b[i] for every i in [0, n).
 * Each array is passed as its own restrict-qualified pointer parameter. */
void fn0(const float *restrict a0, const float *restrict a1,
         float *restrict b, int n)
{
    /* Apply the ALIGN_BYTES alignment hint to all three pointers. */
    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

/* Element-wise sum with the inputs reached through an array of pointers:
 * stores a[0][i] + a[1][i] into b[i] for every i in [0, n). */
void fn1(const float *restrict *restrict a, float *restrict b, int n)
{
    /* The macro expands to an assignment, so for a[0] and a[1] it writes the
     * pointer back into the array that `a` points to. */
    ASSUME_ALIGNED(a[0]); ASSUME_ALIGNED(a[1]); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a[0][i] + a[1][i];
}

当我使用 gcc-4.7.2 -Ofast -march=native -std=c99 -ftree-vectorizer-verbose=5 -S test.c -Wall 编译这两个函数时，我发现 GCC 会为第二个函数插入别名检查。

如何避免这种情况，使 fn1 生成的汇编与 fn0 的汇编相同？（当参数的数量从 3 个增加到 30 个时，逐个传递参数的方式（fn0）会变得很麻烦，而 fn1 方式中的别名检查次数会多到离谱。）

汇编（x86-64，支持 AVX 的芯片）；别名检查位于 .LFB10 处

/* Compiler output for fn0. %rdi and %rsi are the two input arrays, %rdx is
 * the output b, %ecx is n. No pointer-to-pointer comparison appears before
 * the vector loop. */
fn0:
.LFB9:
    .cfi_startproc
    /* Return immediately when n <= 0. */
    testl   %ecx, %ecx
    jle .L1
    /* %r10d = n / 8 (vector iterations), %r9d = n rounded down to a multiple of 8. */
    movl    %ecx, %r10d
    shrl    $3, %r10d
    leal    0(,%r10,8), %r9d
    /* Fewer than 8 elements: go straight to the scalar loop via .L8. */
    testl   %r9d, %r9d
    je  .L8
    cmpl    $7, %ecx
    jbe .L8
    xorl    %eax, %eax
    xorl    %r8d, %r8d
    .p2align 4,,10
    .p2align 3
/* Vector loop: 32 bytes (8 floats) per iteration with aligned AVX moves. */
.L4:
    vmovaps (%rsi,%rax), %ymm0
    addl    $1, %r8d
    vaddps  (%rdi,%rax), %ymm0, %ymm0
    vmovaps %ymm0, (%rdx,%rax)
    addq    $32, %rax
    cmpl    %r8d, %r10d
    ja  .L4
    /* Done if n was an exact multiple of 8. */
    cmpl    %r9d, %ecx
    je  .L1
/* Advance all three pointers past the %r9d elements already processed. */
.L3:
    movslq  %r9d, %rax
    salq    $2, %rax
    addq    %rax, %rdi
    addq    %rax, %rsi
    addq    %rax, %rdx
    xorl    %eax, %eax
    .p2align 4,,10
    .p2align 3
/* Scalar loop: one float per iteration for the remaining elements. */
.L6:
    vmovss  (%rsi,%rax,4), %xmm0
    vaddss  (%rdi,%rax,4), %xmm0, %xmm0
    vmovss  %xmm0, (%rdx,%rax,4)
    addq    $1, %rax
    leal    (%r9,%rax), %r8d
    cmpl    %r8d, %ecx
    jg  .L6
.L1:
    vzeroupper
    ret
/* Entry to the scalar loop with zero elements already processed. */
.L8:
    xorl    %r9d, %r9d
    jmp .L3
    .cfi_endproc
.LFE9:
    .size   fn0, .-fn0
    .p2align 4,,15
    .globl  fn1
    .type   fn1, @function
/* Compiler output for fn1. %rdi is a, %rsi is b, %rdx is n; a[0] and a[1]
 * are loaded into %r8 and %r9. */
fn1:
.LFB10:
    .cfi_startproc
    /* Load a[0] and a[1]; return immediately when n == 0. */
    testq   %rdx, %rdx
    movq    (%rdi), %r8
    movq    8(%rdi), %r9
    je  .L12
    /* Runtime overlap test between b and each input:
     *   %cl  = (a[0] >= b + 32) | (b >= a[0] + 32)
     *   %dil = (a[1] >= b + 32) | (b >= a[1] + 32)
     * and the two results are ANDed into %cl. The n / 8 and (n / 8) * 8
     * computations (%r10, %rax) are interleaved with it. */
    leaq    32(%rsi), %rdi
    movq    %rdx, %r10
    leaq    32(%r8), %r11
    shrq    $3, %r10
    cmpq    %rdi, %r8
    leaq    0(,%r10,8), %rax
    setae   %cl
    cmpq    %r11, %rsi
    setae   %r11b
    orl %r11d, %ecx
    cmpq    %rdi, %r9
    leaq    32(%r9), %r11
    setae   %dil
    cmpq    %r11, %rsi
    setae   %r11b
    orl %r11d, %edi
    andl    %edi, %ecx
    /* Take the vector loop only when n > 7 and the overlap test passed;
     * otherwise fall back to the scalar loop via .L19. */
    cmpq    $7, %rdx
    seta    %dil
    testb   %dil, %cl
    je  .L19
    testq   %rax, %rax
    je  .L19
    xorl    %ecx, %ecx
    xorl    %edi, %edi
    .p2align 4,,10
    .p2align 3
/* Vector loop: 32 bytes (8 floats) per iteration with aligned AVX moves. */
.L15:
    vmovaps (%r9,%rcx), %ymm0
    addq    $1, %rdi
    vaddps  (%r8,%rcx), %ymm0, %ymm0
    vmovaps %ymm0, (%rsi,%rcx)
    addq    $32, %rcx
    cmpq    %rdi, %r10
    ja  .L15
    /* Done if n was an exact multiple of 8. */
    cmpq    %rax, %rdx
    je  .L12
    .p2align 4,,10
    .p2align 3
/* Scalar loop: one float per iteration, starting at index %rax. */
.L20:
    vmovss  (%r9,%rax,4), %xmm0
    vaddss  (%r8,%rax,4), %xmm0, %xmm0
    vmovss  %xmm0, (%rsi,%rax,4)
    addq    $1, %rax
    cmpq    %rax, %rdx
    ja  .L20
.L12:
    vzeroupper
    ret
/* Scalar fallback starting from index 0. */
.L19:
    xorl    %eax, %eax
    jmp .L20
    .cfi_endproc

Answer 1

告诉编译器停止检查别名：

请添加行：

#pragma GCC ivdep

在您想要进行矢量化的循环前面，如果您需要更多信息，请阅读：

https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/Loop-Specific-Pragmas.html

Answer 2

这有帮助吗？

/* Variant of fn1 that first copies a[0] and a[1] into local restrict-qualified
 * pointers and then sums through those locals:
 * stores a0[i] + a1[i] into b[i] for every i in [0, n). */
void fn1(const float **restrict a, float *restrict b, int n)
{
    const float * restrict a0 = a[0];
    const float * restrict a1 = a[1];

    /* The alignment hints are applied to the local copies, not to a[0]/a[1]. */
    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

编辑：第二次尝试 :)。根据 http://locklessinc.com/articles/vectorize/ 中的信息：

gcc -ffast-math ...

Answer 3

那么，试试这个编译标志（flag）怎么样：

-fno-strict-aliasing

据我理解，你只是想知道如何关掉这个检查？如果仅此而已，那么 gcc 命令行的这个参数应该对你有帮助。

修改

针对你的评论再补充一点：对 const 类型使用 restrict 指针，难道不是被禁止的吗？

以下内容来自 ISO/IEC 9899（6.7.3.1 restrict 的正式定义）：

1


设 D 是一个普通标识符的声明，它提供了一种将对象 P 指定为指向类型 T 的 restrict 限定指针的方式。

4


在 B 的每次执行期间，设 L 为任何其地址 &L 基于 P 的左值。如果 L 被用于访问它所指定的对象 X 的值，并且 X 也被（以任何方式）修改，那么适用以下要求：T 不得是 const 限定的。用于访问 X 的值的其他每个左值，其地址也应基于 P。就本小节而言，每一次修改 X 的访问也应被视为修改了 P。如果 P 被赋予一个指针表达式 E 的值，而 E 基于与块 B2 相关联的另一个受限指针对象 P2，那么要么 B2 的执行应在 B 的执行之前开始，要么 B2 的执行应在该赋值之前结束。如果不满足这些要求，则行为未定义。

更有趣的是下面这一条，这与 register 关键字的情况相同：

6


翻译器（即编译器）可以自由地忽略使用 restrict 所带来的任何或全部别名含义。

因此，如果你找不到强制gcc这样做的命令参数，那么它可能是不可能的，因为从标准来看它不必提供这样做的选项。

Answer 4

我提前道歉，因为我无法在我的机器上使用GCC 4.7重现结果，但有两种可能的解决方案。

使用 typedef 来正确地组合出 * restrict * restrict。据一位开发 LLVM 编译器的前同事说，在 C 语言中 typedef 的行为通常类似于预处理器的文本替换，而这是唯一的例外；它的存在正是为了实现你想要的无别名（anti-aliasing）行为。

我在下面尝试了这个，但我不确定我是否成功了。请仔细检查我的尝试。
使用问题 “using restrict qualifier with C99 variable length arrays (VLAs)” 的答案中描述的语法。

我在下面尝试了这个，但我不确定我是否成功了。请仔细检查我的尝试。

以下是我用来做实验的代码，但我无法确定我的任何一项建议是否能达到要求。

/* Alignment, in bytes, that ASSUME_ALIGNED promises to the compiler. */
#define ALIGN_BYTES 32
/* Reassigns x to the result of GCC's __builtin_assume_aligned(x, ALIGN_BYTES).
 * x appears twice in the expansion and is the target of an assignment, so it
 * must be a modifiable pointer lvalue. */
#define ASSUME_ALIGNED(x) x = __builtin_assume_aligned(x, ALIGN_BYTES)

/* Reference version: stores a0[i] + a1[i] into b[i] for every i in [0, n),
 * with each array passed as its own restrict-qualified pointer parameter. */
void fn0(const float *restrict a0, const float *restrict a1,
         float *restrict b, int n)
{
    /* Apply the ALIGN_BYTES alignment hint to all three pointers. */
    ASSUME_ALIGNED(a0); ASSUME_ALIGNED(a1); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a0[i] + a1[i];
}

/* fn1 with three alternative spellings of its parameter list, selected at
 * compile time:
 *   ARRAY_RESTRICT   - array-parameter syntax with restrict inside the brackets;
 *   TYPEDEF_SOLUTION - restrict folded into the typedef frp (float * restrict);
 *   otherwise        - the original `*restrict *restrict` spelling.
 * All three share one body: b[i] = a[0][i] + a[1][i] for every i in [0, n).
 * NOTE(review): in the TYPEDEF_SOLUTION branch, `const frp` const-qualifies
 * the pointer elements themselves rather than the floats they point to, so its
 * type differs from the other two branches. */
#if defined(ARRAY_RESTRICT)
void fn1(const float *restrict a[restrict], float * restrict b, int n)
#elif defined(TYPEDEF_SOLUTION)
typedef float * restrict frp;
void fn1(const frp *restrict a, float *restrict b, int n)
#else
void fn1(const float *restrict *restrict a, float *restrict b, int n)
#endif
{
    /* Alignment hints are left disabled in this experiment. */
    //ASSUME_ALIGNED(a[0]); ASSUME_ALIGNED(a[1]); ASSUME_ALIGNED(b);

    for (int i = 0; i < n; ++i)
        b[i] = a[0][i] + a[1][i];
}

再次为这个答案不够成熟而道歉。请不要因为我尝试了却没有成功而给我投反对票。

GCC 对 restrict 指针的别名检查

4 个答案: