如果你这样做，它可以帮助gcc和clang一次复制多个字节：

Question

在C中访问结构成员时，这里有很多主题是直接访问还是间接访问（通过指针）更快。

一个例子：C pointers vs direct member access for structs

一般意见是直接访问将更快（至少在理论上），因为不使用指针解除引用。

所以我在我的系统中尝试了一大堆代码：GNU Embedded Tools GCC 4.7.4，为ARM生成代码（实际上是ARM-Cortex-A15）。

令人惊讶的是，直接访问速度要慢得多。然后我生成了目标文件的汇编代码。

直接访问代码有114行汇编代码，间接访问代码有33行汇编代码。这是怎么回事？

以下是C代码和生成的函数汇编代码。结构都映射到外部存储器，结构成员都是单字节字长（无符号字符类型）。

第一个函数，使用间接访问（通过指针）：

/*
 * Indirect-access variant: every structure is reached through a pointer
 * parameter.
 *
 * Does nothing unless num_of == 0.  In that case it marks curr_id as UNUSED
 * and copies cnt, _mode, type, size, allocation_type, allocation_localized
 * and mode_enable of *output_ptr into the matching curr_* members.  If the
 * copied count is 1 it additionally initialises *first_file_ptr and
 * *second_file_ptr and mirrors the layer count into total_layer_cnt.
 *
 * All three pointers are __restrict__-qualified, i.e. the caller promises
 * they do not alias one another.  The accompanying text states that the
 * structures live in external memory and that every member is a single
 * unsigned char.
 *
 * LIKELY, UNUSED, BLOCK_IDLE, USER_DATA_TYPE, FIRST__WORD and DISABLED are
 * defined elsewhere; LIKELY is presumably a branch-prediction hint
 * (NOTE(review): confirm against its definition).
 */
void sub_func_1(unsigned int num_of, struct file_s *__restrict__ first_file_ptr, struct file_s  *__restrict__ second_file_ptr, struct output_s *__restrict__ output_ptr)
{
    if(LIKELY(num_of == 0))
     {
        /* Snapshot the configured values into the "current" members. */
        output_ptr->curr_id                         = UNUSED;
        output_ptr->curr_cnt                        = output_ptr->cnt;

        output_ptr->curr_mode                       = output_ptr->_mode;
        output_ptr->curr_type                       = output_ptr->type;
        output_ptr->curr_size                       = output_ptr->size;
        output_ptr->curr_allocation_type            = output_ptr->allocation_type;
        output_ptr->curr_allocation_localized       = output_ptr->allocation_localized;
        output_ptr->curr_mode_enable                = output_ptr->mode_enable;

        /* Only the single-entry case sets up the two file descriptors. */
        if(output_ptr->curr_cnt == 1)
         {
            first_file_ptr->status                  = BLOCK_IDLE;
            first_file_ptr->type                    = USER_DATA_TYPE;
            first_file_ptr->index                   = FIRST__WORD;
            first_file_ptr->layer_cnt               = output_ptr->layer_cnt;

            second_file_ptr->status                 = DISABLED;
            second_file_ptr->index                  = 0;
            second_file_ptr->redundancy_version     = 1;

            /* Read back the value just stored in the first file entry. */
            output_ptr->total_layer_cnt             = first_file_ptr->layer_cnt;
         }
     }
}

00000000 <sub_func_1>:
   0:   e3500000    cmp r0, #0
   4:   e92d01f0    push    {r4, r5, r6, r7, r8}
   8:   1a00001b    bne 7c <sub_func_1+0x7c>
   c:   e5d34007    ldrb    r4, [r3, #7]
  10:   e3a05008    mov r5, #8
  14:   e5d3c003    ldrb    ip, [r3, #3]
  18:   e5d38014    ldrb    r8, [r3, #20]
  1c:   e5c35001    strb    r5, [r3, #1]
  20:   e5d37015    ldrb    r7, [r3, #21]
  24:   e5d36018    ldrb    r6, [r3, #24]
  28:   e5c34008    strb    r4, [r3, #8]
  2c:   e5d35019    ldrb    r5, [r3, #25]
  30:   e35c0001    cmp ip, #1
  34:   e5c3c005    strb    ip, [r3, #5]
  38:   e5d34012    ldrb    r4, [r3, #18]
  3c:   e5c38010    strb    r8, [r3, #16]
  40:   e5c37011    strb    r7, [r3, #17]
  44:   e5c3601a    strb    r6, [r3, #26]
  48:   e5c3501b    strb    r5, [r3, #27]
  4c:   e5c34013    strb    r4, [r3, #19]
  50:   1a000009    bne 7c <sub_func_1+0x7c>
  54:   e5d3400b    ldrb    r4, [r3, #11]
  58:   e3a05005    mov r5, #5
  5c:   e5c1c000    strb    ip, [r1]
  60:   e5c10002    strb    r0, [r1, #2]
  64:   e5c15001    strb    r5, [r1, #1]
  68:   e5c20000    strb    r0, [r2]
  6c:   e5c14003    strb    r4, [r1, #3]
  70:   e5c20005    strb    r0, [r2, #5]
  74:   e5c2c014    strb    ip, [r2, #20]
  78:   e5c3400f    strb    r4, [r3, #15]
  7c:   e8bd01f0    pop {r4, r5, r6, r7, r8}
  80:   e12fff1e    bx  lr

第二个函数，使用直接访问（通过数组下标）：

/*
 * "Direct"-access variant: performs the same sequence of assignments as
 * sub_func_1, but addresses every member through the array output_file
 * (not declared in this function) using output_index and cc_index instead
 * of receiving pointers.
 *
 * Does nothing unless num_of == 0.  Otherwise it marks curr_id as UNUSED and
 * copies the configured members of output_file[output_index] into their
 * curr_* counterparts.  If the copied count is 1 it initialises
 * cc_file[cc_index].file[0] and .file[1] and mirrors the layer count into
 * total_layer_cnt.
 *
 * Every statement spells out the full output_file[output_index]... path,
 * so the element address has to be derived from the indices rather than
 * taken from an argument register.
 *
 * NOTE(review): no bounds checking is done on output_index or cc_index;
 * valid ranges are not visible here.
 */
void sub_func_2(unsigned int output_index, unsigned int cc_index, unsigned int num_of)
{
    if(LIKELY(num_of == 0))
     {
        /* Snapshot the configured values into the "current" members. */
        output_file[output_index].curr_id                       = UNUSED;
        output_file[output_index].curr_cnt                      = output_file[output_index].cnt;

        output_file[output_index].curr_mode                     = output_file[output_index]._mode;
        output_file[output_index].curr_type                     = output_file[output_index].type;
        output_file[output_index].curr_size                     = output_file[output_index].size;
        output_file[output_index].curr_allocation_type          = output_file[output_index].allocation_type;
        output_file[output_index].curr_allocation_localized     = output_file[output_index].allocation_localized;
        output_file[output_index].curr_mode_enable              = output_file[output_index].mode_enable;

        /* Only the single-entry case sets up the two file descriptors. */
        if(output_file[output_index].curr_cnt == 1)
         {
            output_file[output_index].cc_file[cc_index].file[0].status              = BLOCK_IDLE;
            output_file[output_index].cc_file[cc_index].file[0].type                = USER_DATA_TYPE;
            output_file[output_index].cc_file[cc_index].file[0].index               = FIRST__WORD;
            output_file[output_index].cc_file[cc_index].file[0].layer_cnt           = output_file[output_index].layer_cnt;

            output_file[output_index].cc_file[cc_index].file[1].status              = DISABLED;
            output_file[output_index].cc_file[cc_index].file[1].index               = 0;
            output_file[output_index].cc_file[cc_index].file[1].redundancy_version  = 1;

            /* Read back the value just stored in file[0]. */
            output_file[output_index].total_layer_cnt                               = output_file[output_index].cc_file[cc_index].file[0].layer_cnt;
         }
     }
}

00000084 <sub_func_2>:
  84:   e92d0ff0    push    {r4, r5, r6, r7, r8, r9, sl, fp}
  88:   e3520000    cmp r2, #0
  8c:   e24dd018    sub sp, sp, #24
  90:   e58d2004    str r2, [sp, #4]
  94:   1a000069    bne 240 <sub_func_2+0x1bc>
  98:   e3a03d61    mov r3, #6208   ; 0x1840
  9c:   e30dc0c0    movw    ip, #53440  ; 0xd0c0
  a0:   e340c001    movt    ip, #1
  a4:   e3002000    movw    r2, #0
  a8:   e0010193    mul r1, r3, r1
  ac:   e3402000    movt    r2, #0
  b0:   e3067490    movw    r7, #25744  ; 0x6490
  b4:   e3068488    movw    r8, #25736  ; 0x6488
  b8:   e3a0b008    mov fp, #8
  bc:   e3066498    movw    r6, #25752  ; 0x6498
  c0:   e02c109c    mla ip, ip, r0, r1
  c4:   e082c00c    add ip, r2, ip
  c8:   e28c3b19    add r3, ip, #25600  ; 0x6400
  cc:   e08c4007    add r4, ip, r7
  d0:   e5d39083    ldrb    r9, [r3, #131]  ; 0x83
  d4:   e08c5006    add r5, ip, r6
  d8:   e5d3a087    ldrb    sl, [r3, #135]  ; 0x87
  dc:   e5c3b081    strb    fp, [r3, #129]  ; 0x81
  e0:   e5c39085    strb    r9, [r3, #133]  ; 0x85
  e4:   e2833080    add r3, r3, #128    ; 0x80
  e8:   e7cca008    strb    sl, [ip, r8]
  ec:   e5d4a004    ldrb    sl, [r4, #4]
  f0:   e7cca007    strb    sl, [ip, r7]
  f4:   e5d47005    ldrb    r7, [r4, #5]
  f8:   e5c47001    strb    r7, [r4, #1]
  fc:   e7dc6006    ldrb    r6, [ip, r6]
 100:   e5d5c001    ldrb    ip, [r5, #1]
 104:   e5c56002    strb    r6, [r5, #2]
 108:   e5c5c003    strb    ip, [r5, #3]
 10c:   e5d4c002    ldrb    ip, [r4, #2]
 110:   e5c4c003    strb    ip, [r4, #3]
 114:   e5d33005    ldrb    r3, [r3, #5]
 118:   e3530001    cmp r3, #1
 11c:   1a000047    bne 240 <sub_func_2+0x1bc>
 120:   e30dc0c0    movw    ip, #53440  ; 0xd0c0
 124:   e30db0c0    movw    fp, #53440  ; 0xd0c0
 128:   e1a0700c    mov r7, ip
 12c:   e7dfc813    bfi ip, r3, #16, #16
 130:   e1a05007    mov r5, r7
 134:   e1a0900b    mov r9, fp
 138:   e02c109c    mla ip, ip, r0, r1
 13c:   e1a04005    mov r4, r5
 140:   e1a0a00b    mov sl, fp
 144:   e7df9813    bfi r9, r3, #16, #16
 148:   e7dfb813    bfi fp, r3, #16, #16
 14c:   e1a06007    mov r6, r7
 150:   e7dfa813    bfi sl, r3, #16, #16
 154:   e58dc008    str ip, [sp, #8]
 158:   e7df6813    bfi r6, r3, #16, #16
 15c:   e1a0c004    mov ip, r4
 160:   e7df4813    bfi r4, r3, #16, #16
 164:   e02b109b    mla fp, fp, r0, r1
 168:   e7df5813    bfi r5, r3, #16, #16
 16c:   e0291099    mla r9, r9, r0, r1
 170:   e7df7813    bfi r7, r3, #16, #16
 174:   e7dfc813    bfi ip, r3, #16, #16
 178:   e0261096    mla r6, r6, r0, r1
 17c:   e0241094    mla r4, r4, r0, r1
 180:   e082b00b    add fp, r2, fp
 184:   e0829009    add r9, r2, r9
 188:   e02a109a    mla sl, sl, r0, r1
 18c:   e28bbc65    add fp, fp, #25856  ; 0x6500
 190:   e58d600c    str r6, [sp, #12]
 194:   e2899c65    add r9, r9, #25856  ; 0x6500
 198:   e3a06005    mov r6, #5
 19c:   e58d4010    str r4, [sp, #16]
 1a0:   e59d4008    ldr r4, [sp, #8]
 1a4:   e0251095    mla r5, r5, r0, r1
 1a8:   e5cb3000    strb    r3, [fp]
 1ac:   e082a00a    add sl, r2, sl
 1b0:   e59db00c    ldr fp, [sp, #12]
 1b4:   e5c96001    strb    r6, [r9, #1]
 1b8:   e59d6004    ldr r6, [sp, #4]
 1bc:   e28aac65    add sl, sl, #25856  ; 0x6500
 1c0:   e58d5014    str r5, [sp, #20]
 1c4:   e0271097    mla r7, r7, r0, r1
 1c8:   e0825004    add r5, r2, r4
 1cc:   e30d40c0    movw    r4, #53440  ; 0xd0c0
 1d0:   e02c109c    mla ip, ip, r0, r1
 1d4:   e0855008    add r5, r5, r8
 1d8:   e7df4813    bfi r4, r3, #16, #16
 1dc:   e5ca6002    strb    r6, [sl, #2]
 1e0:   e5d59003    ldrb    r9, [r5, #3]
 1e4:   e082600b    add r6, r2, fp
 1e8:   e59db014    ldr fp, [sp, #20]
 1ec:   e0201094    mla r0, r4, r0, r1
 1f0:   e2866c65    add r6, r6, #25856  ; 0x6500
 1f4:   e59d1010    ldr r1, [sp, #16]
 1f8:   e306a53c    movw    sl, #25916  ; 0x653c
 1fc:   e0827007    add r7, r2, r7
 200:   e2877c65    add r7, r7, #25856  ; 0x6500
 204:   e082c00c    add ip, r2, ip
 208:   e5c69003    strb    r9, [r6, #3]
 20c:   e59d6004    ldr r6, [sp, #4]
 210:   e28ccc65    add ip, ip, #25856  ; 0x6500
 214:   e082500b    add r5, r2, fp
 218:   e0820000    add r0, r2, r0
 21c:   e0824001    add r4, r2, r1
 220:   e085500a    add r5, r5, sl
 224:   e0808008    add r8, r0, r8
 228:   e7c4600a    strb    r6, [r4, sl]
 22c:   e5c56005    strb    r6, [r5, #5]
 230:   e5c73050    strb    r3, [r7, #80]   ; 0x50
 234:   e5dc3003    ldrb    r3, [ip, #3]
 238:   e287704c    add r7, r7, #76 ; 0x4c
 23c:   e5c83007    strb    r3, [r8, #7]
 240:   e28dd018    add sp, sp, #24
 244:   e8bd0ff0    pop {r4, r5, r6, r7, r8, r9, sl, fp}
 248:   e12fff1e    bx  lr

最后一部分，我的编译选项是：

# Compile options.
C_OPTS =    -Wall \
-std=gnu99 \
-fgnu89-inline \
-Wcast-align \
-Werror=uninitialized \
-Werror=maybe-uninitialized \
-Werror=overflow \
-mcpu=cortex-a15 \
-mtune=cortex-a15 \
-mabi=aapcs \
-mfpu=neon \
-ftree-vectorize \
-ftree-slp-vectorize \
-ftree-vectorizer-verbose=4 \
-mfloat-abi=hard \
-O3 \
-flto \
-marm \
-ffat-lto-objects \
-fno-gcse \
-fno-strict-aliasing \
-fno-delete-null-pointer-checks \
-fno-strict-overflow \
-fuse-linker-plugin \
-falign-functions=4 \
-falign-loops=4 \
-falign-labels=4 \
-falign-jumps=4

更新

注意：我删除了结构定义，因为与我自己的程序版本存在差异。它实际上是一个巨大的结构，完全放在这里效率不高。

正如建议的那样，我去掉了-fno-gcse，生成的汇编代码不再像以前那么庞大。

如果没有-fno-gcse，sub_func_1会生成与上面相同的代码。

对于sub_func_2：

00000084 <sub_func_2>:
  84:   e3520000    cmp r2, #0
  88:   e92d0070    push    {r4, r5, r6}
  8c:   1a000030    bne 154 <sub_func_2+0xd0>
  90:   e30d30c0    movw    r3, #53440  ; 0xd0c0
  94:   e3a06008    mov r6, #8
  98:   e3403001    movt    r3, #1
  9c:   e0030093    mul r3, r3, r0
  a0:   e3a00d61    mov r0, #6208   ; 0x1840
  a4:   e0213190    mla r1, r0, r1, r3
  a8:   e59f30ac    ldr r3, [pc, #172]  ; 15c <sub_func_2+0xd8>
  ac:   e0831001    add r1, r3, r1
  b0:   e2813b19    add r3, r1, #25600  ; 0x6400
  b4:   e5d34083    ldrb    r4, [r3, #131]  ; 0x83
  b8:   e1a00003    mov r0, r3
  bc:   e5d35087    ldrb    r5, [r3, #135]  ; 0x87
  c0:   e5c36081    strb    r6, [r3, #129]  ; 0x81
  c4:   e5c34085    strb    r4, [r3, #133]  ; 0x85
  c8:   e3064488    movw    r4, #25736  ; 0x6488
  cc:   e2833080    add r3, r3, #128    ; 0x80
  d0:   e7c15004    strb    r5, [r1, r4]
  d4:   e5d05094    ldrb    r5, [r0, #148]  ; 0x94
  d8:   e0844006    add r4, r4, r6
  dc:   e7c15004    strb    r5, [r1, r4]
  e0:   e5d04095    ldrb    r4, [r0, #149]  ; 0x95
  e4:   e5d0c092    ldrb    ip, [r0, #146]  ; 0x92
  e8:   e5c04091    strb    r4, [r0, #145]  ; 0x91
  ec:   e3064498    movw    r4, #25752  ; 0x6498
  f0:   e7d15004    ldrb    r5, [r1, r4]
  f4:   e5c0c093    strb    ip, [r0, #147]  ; 0x93
  f8:   e5d04099    ldrb    r4, [r0, #153]  ; 0x99
  fc:   e5c0509a    strb    r5, [r0, #154]  ; 0x9a
 100:   e5c0409b    strb    r4, [r0, #155]  ; 0x9b
 104:   e5d33005    ldrb    r3, [r3, #5]
 108:   e3530001    cmp r3, #1
 10c:   1a000010    bne 154 <sub_func_2+0xd0>
 110:   e281cc65    add ip, r1, #25856  ; 0x6500
 114:   e3a06005    mov r6, #5
 118:   e2810b19    add r0, r1, #25600  ; 0x6400
 11c:   e1a0500c    mov r5, ip
 120:   e5cc3000    strb    r3, [ip]
 124:   e1a0400c    mov r4, ip
 128:   e5cc6001    strb    r6, [ip, #1]
 12c:   e5cc2002    strb    r2, [ip, #2]
 130:   e5d0608b    ldrb    r6, [r0, #139]  ; 0x8b
 134:   e5cc6003    strb    r6, [ip, #3]
 138:   e306c53c    movw    ip, #25916  ; 0x653c
 13c:   e7c1200c    strb    r2, [r1, ip]
 140:   e5c52041    strb    r2, [r5, #65]   ; 0x41
 144:   e285503c    add r5, r5, #60 ; 0x3c
 148:   e5c43050    strb    r3, [r4, #80]   ; 0x50
 14c:   e284404c    add r4, r4, #76 ; 0x4c
 150:   e5c0608f    strb    r6, [r0, #143]  ; 0x8f
 154:   e8bd0070    pop {r4, r5, r6}
 158:   e12fff1e    bx  lr
 15c:   00000000    .word   0x00000000

Answer 1

TL;DR：我无法重现那种疯狂的编译器输出。也许是周围的代码加上LTO造成的？

我确实有改进代码的建议：请参阅下面有关复制整个结构而不是复制许多个别成员的内容。

您链接的那个问题讨论的是：直接访问一个值类型的全局变量，还是通过全局指针去访问它。在ARM上，要把任意32位地址放入寄存器，需要多条指令或者一次从附近常量池的加载，因此把指针作为参数传入，比让每个函数直接引用全局变量更好。

请参阅Godbolt Compiler Explorer（ARM gcc 4.8.2 -O3）上的此示例：

// Three-int struct plus one file-scope instance, global_example, which
// load_global() reads by name.
struct example {
  int a, b, c;
} global_example;

// Returns member c of the file-scope object global_example, accessed by name.
int load_global(void) { return global_example.c; }
        movw    r3, #:lower16:global_example    @ tmp113,
        movt    r3, #:upper16:global_example    @ tmp113,
        ldr     r0, [r3, #8]      @, global_example.c
        bx      lr  @
// Returns member c of the struct that the caller-supplied pointer p refers to.
int load_pointer(struct example *p) { return p->c; }
        ldr     r0, [r0, #8]      @, p_2(D)->c
        bx      lr  @

（显然gcc在按值传递结构体作为函数参数时生成的代码非常糟糕，请参阅godbolt链接上byval(struct example by_val)的代码。）

更糟糕的情况是您有一个全局指针：首先必须加载指针本身的值，然后再用一次加载来解引用它。这就是您链接的问题中讨论的间接访问开销。如果两次加载都发生缓存未命中，您就要付出两次往返延迟。在第一次加载完成之前，第二次加载的地址是未知的，因此即使在乱序执行的CPU上，这两个内存请求也无法流水线化。

如果指针是作为参数传入的，它就已经在寄存器中了。对它解引用与从全局变量加载的代价相同。（甚至更好，因为您不需要自己把全局变量的地址装入寄存器。）

您的真实代码

我无法在Godbolt上使用ARM gcc 4.8.2或使用ARM gcc 5.2.1在本地重现您的大量asm输出。但是，我没有使用LTO，因为我没有完整的测试程序。

我只能看到稍微大一点的代码来做一些索引数学。

bfi是位域插入（Bitfield Insert）指令。我认为144: e7df9813 bfi r9, r3, #16, #16的作用是把r9的高半部分设置为r3的低半部分。我看不出这能起多大作用，不过mla（整数乘加）倒是说得通。除了-ftree-vectorize带来的反常结果之外，我能想到的唯一解释是-fno-gcse对您所测试的gcc版本产生了非常糟糕的影响。

它是不是在处理将要被存储的常量？您实际发布的代码把所有常量都#define成了0，gcc利用了这一点。（它还利用了这样一个事实：由于curr_cnt == 1的比较，寄存器中已经有一个1，于是直接把该寄存器存入second_file_ptr->redundancy_version = 1;。）ARM没有str [mem], immediate这样的指令，也没有类似x86的mov [mem], imm的东西。

如果您的编译器输出来自具有这些常量的不同值的代码，则编译器将执行更多工作来存储不同的内容。

不幸的是，gcc不擅长把多个窄存储合并成一个较宽的存储（这是一个长期存在的错失优化的bug）。对于x86，clang至少在一种情况下会这样做：存储一个0x0100（256），而不是分别存储0和1。（可以在godbolt上把编译器切换到clang 3.7.1之类的版本，并删除ARM专用的编译器参数来验证。）在gcc使用下面两条指令的地方，clang生成的是一条mov word ptr [rsi], 256：

    mov     BYTE PTR [rsi], 0 # *first_file_ptr_23(D).status,
    mov     BYTE PTR [rsi+1], 1       # *first_file_ptr_23(D).type,

如果您仔细安排了结构的布局，那么在此函数中以4字节为单位进行块复制的机会会多得多。

为curr和非curr各设一个相同的子结构，而不是使用curr_size和size这样成对的成员，也会有所帮助。但是，您可能必须将其声明为packed，以避免在子结构之后出现填充。您的两组成员的顺序并不完全相同，因此即使您一次执行一大批赋值，编译器也无法进行大块复制。

如果你这样做，它可以帮助gcc和clang一次复制多个字节：

// Example layout that groups the mirrored members into two instances of the
// same packed sub-struct (curr and non_curr), so one set can be copied to the
// other with a single struct assignment instead of seven byte assignments.
struct output_s_optimized {
   // Packed so the 7 one-byte members occupy exactly 7 bytes with no padding.
   struct __attribute__((packed)) stuff {
       unsigned char cnt,
                    mode,
                    type,
                    size,
                    allocation_type,
                    allocation_localized,
                    mode_enable;
   } curr;  // 7B
   unsigned char    curr_id;  // no non-curr id?

   struct stuff non_curr;
   unsigned char layer_cnt;
                              // Another 8 byte boundary here
   unsigned char total_layer_cnt;

   // struct cc_file_s is declared elsewhere; its layout is not visible here.
   struct cc_file_s cc_file[128];    
};

/*
 * Whole-struct version of the "refresh current settings" step.
 *
 * Clears curr_id, then copies the seven configured members into the current
 * set with a single struct assignment (non_curr -> curr).  That is the same
 * direction as the member-by-member assignments in bar() and as the
 * curr_X = X assignments in the original question.
 *
 * p must point to a valid struct output_s_optimized; no NULL check is done.
 */
void foo(struct output_s_optimized *p) {
  p->curr_id = 0;
  /* One aggregate assignment lets the compiler pick wider copies. */
  p->curr = p->non_curr;
}
/*
 * Member-by-member version: clears curr_id, then copies each of the seven
 * members of non_curr into the matching member of curr with a separate
 * assignment.  Written out field by field for comparison with the single
 * struct assignment used in foo().
 *
 * output_ptr must point to a valid struct output_s_optimized; no NULL check
 * is done.
 */
void bar(struct output_s_optimized *output_ptr) {
  output_ptr->curr_id = 0;
  output_ptr->curr.cnt                        = output_ptr->non_curr.cnt;
  output_ptr->curr.mode                       = output_ptr->non_curr.mode;
  output_ptr->curr.type                       = output_ptr->non_curr.type;
  output_ptr->curr.size                       = output_ptr->non_curr.size;
  output_ptr->curr.allocation_type            = output_ptr->non_curr.allocation_type;
  output_ptr->curr.allocation_localized       = output_ptr->non_curr.allocation_localized;
  output_ptr->curr.mode_enable                = output_ptr->non_curr.mode_enable;
}

gcc 4.8.2把foo()编译成三次复制：1字节、2字节和4字节，即使在ARM上也是如此。它把bar()编译成8次单字节复制，x86上的clang-3.8也是如此。因此，复制整个结构可以帮助您的编译器（同时还能确保要复制的数据在两个位置以相同的顺序排列）。

x86上的相同代码：没什么新的

您可以使用-fverbose-asm为每一行汇编加上注释。对于x86，gcc 6.1 -O3为这两个版本的函数生成的输出非常相似，正如您在Godbolt Compiler Explorer上看到的那样。x86的寻址模式可以直接对全局变量进行索引，所以您会看到像下面这样的代码：

movzx   edi, BYTE PTR [rcx+10]        # *output_ptr_7(D)._mode
# where rcx is the output_ptr arg, used directly

VS

movzx   ecx, BYTE PTR output_file[rdi+10]     # output_file[output_index_7(D)]._mode
# where rdi = output_index * 1297  (sizeof(output_file[0])), calculated once at the start

（gcc显然并不在意每条指令的寻址模式中都带有一个4字节的位移量，但这是一个ARM问题，所以我不会深入讨论x86变长指令在代码大小和指令数量之间的权衡。）

Answer 2

用宽泛的（与体系结构无关的）说法，您的语句所做的事情如下：

global_structure_pointer->field = value;

将global_structure_pointer的值加载到寻址寄存器中。
将field表示的偏移量添加到寻址寄存器。
将value存储到寻址寄存器寻址的存储单元中。

global_structure[index].field = value;

将global_structure的地址加载到寻址寄存器中。
将index的值加载到算术寄存器中。
将算术寄存器乘以global_structure的大小。
将运算寄存器添加到寻址寄存器。
将value存储到寻址寄存器寻址的存储单元中。

你的困惑似乎源于对“直接访问”究竟是什么的误解。

这才是直接访问：

global_structure.field = value;

您所认为的直接访问实际上是索引访问。

与在GCC中间接访问相比，为什么直接访问结构成员会产生更多的汇编代码？

2 个答案:

您的真实代码

如果你这样做，它可以帮助gcc和clang一次复制多个字节：

x86上的相同代码：没什么新的