使用__builtin_msa_ld_*

时间:2018-10-21 05:58:48

标签: c mips simd intrinsics

我正在使用Codescape GCC Toolchain评估MIPS SIMD Architecture (MSA)编程。关于MSA及其内建函数的资料很少。(据我所知,只有两款支持MSA的CPU,即P5600和Warrior I6400,它们在几年前就已面世。)

我的测试程序在下面。

#include <msa.h>
#include <stdint.h>

#define ALIGN16 __attribute__((aligned(16)))

int main(int argc, char* argv[])
{
    ALIGN16 uint32_t a[] = {64, 128, 256, 512};        /* first input, 16-byte aligned */
    ALIGN16 uint32_t b[] = {1024, 2048, 4096, 8192};   /* second input, 16-byte aligned */
    ALIGN16 uint32_t c[4];                             /* destination of the vector store */

    v4u32 va = __builtin_msa_ld_w (a, 0);   /* rejected: the load builtin returns v4i32, not v4u32 */
    v4u32 vb = __builtin_msa_ld_w (b, 0);   /* rejected for the same signed/unsigned mismatch */

    v4u32 vc = __builtin_msa_adds_u_w (va, vb);   /* takes and yields v4u32; no diagnostic here */
    __builtin_msa_st_w (vc, c, 0);          /* rejected: the store builtin expects v4i32 */

    return 0;
}

编译该程序会产生下面所示的错误。问题在于向量加载内建函数返回的是有符号向量,而我的向量是无符号的。向量存储也有类似的问题。

// The 4 vector loads provided through builtins
v16i8 __builtin_msa_ld_b (void *, imm_n512_511);    // byte
v8i16 __builtin_msa_ld_h (void *, imm_n1024_1022);  // half word
v4i32 __builtin_msa_ld_w (void *, imm_n2048_2044);  // word
v2i64 __builtin_msa_ld_d (void *, imm_n4096_4088);  // double word

(imm_n512_511及类似的立即数类型在GCC手册的6.59.16 MIPS SIMD Architecture (MSA) Support一节中有说明。)

我阅读了MIPS关于MIPS SIMD Architecture的文档(?),但其中没有讨论如何在整数向量类型之间进行转换。浮点转换指令有很多,但整数类型之间的转换却没有。

简单的强制转换是在整数矢量类型之间进行转换的首选方法吗?还是我应该做些其他事情?


MSA$ mips-img-linux-gnu-gcc.exe -mmsa test.c -c
test.c: In function 'main':
test.c:12:2: note: use -flax-vector-conversions to permit conversions between ve
ctors with differing element types or numbers of subparts
  v4u32 va = __builtin_msa_ld_w (a, 0);
  ^~~~~
test.c:12:13: error: incompatible types when initializing type 'v4u32 {aka __vec
tor(4) unsigned int}' using type '__vector(4) int'
  v4u32 va = __builtin_msa_ld_w (a, 0);
             ^~~~~~~~~~~~~~~~~~
test.c:13:13: error: incompatible types when initializing type 'v4u32 {aka __vec
tor(4) unsigned int}' using type '__vector(4) int'
  v4u32 vb = __builtin_msa_ld_w (b, 0);
             ^~~~~~~~~~~~~~~~~~
test.c:16:22: error: incompatible type for argument 1 of '__builtin_msa_st_w'
  __builtin_msa_st_w (vc, c, 0);
                      ^~
test.c:16:22: note: expected '__vector(4) int' but argument is of type 'v4u32 {a
ka __vector(4) unsigned int}'

2 个答案:

答案 0 :(得分:2)

要么使用强制转换和-flax-vector-conversions,要么使用联合(union)类型来表示向量寄存器,并显式地在该联合类型上进行操作。GCC明确支持这种形式的类型双关(type punning)。

例如,您可以声明一个msa128类型,

/*
 * msa128: a single 16-byte-aligned value that can be viewed as any of the
 * MSA vector types. All members share the same storage, so writing one member
 * and reading another reinterprets the bits (union type punning) rather than
 * converting lane values.
 */
typedef union __attribute__ ((aligned (16))) {
    v2u64   u64;    /* 2 lanes, unsigned 64-bit */
    v2i64   i64;    /* 2 lanes, signed 64-bit */
    v2f64   f64;    /* 2 lanes, double */
    v4u32   u32;    /* 4 lanes, unsigned 32-bit */
    v4i32   i32;    /* 4 lanes, signed 32-bit */
    v4f32   f32;    /* 4 lanes, float */
    v8u16   u16;    /* 8 lanes, unsigned 16-bit */
    v8i16   i16;    /* 8 lanes, signed 16-bit */
    v16u8   u8;     /* 16 lanes, unsigned 8-bit */
    v16i8   i8;     /* 16 lanes, signed 8-bit */
} msa128;

然后让您的代码显式地使用msa128类型。您的示例程序可以写为:

    /* Inputs and output buffer for a 4 x 32-bit vector add. */
    uint32_t a[4] = { 64, 128, 256, 512 };
    uint32_t b[4] = { 1024, 2048, 4096, 8192 };
    uint32_t c[4];
    msa128   va, vb, vc;

    /* The word load returns v4i32, so the result is written through the signed view. */
    va.i32 = __builtin_msa_ld_w(a, 0);
    vb.i32 = __builtin_msa_ld_w(b, 0);
    /* Read the same bits back through the unsigned view for the unsigned add. */
    vc.u32 = __builtin_msa_adds_u_w(va.u32, vb.u32);
    /* The word store takes v4i32, so pass the signed view of the result. */
    __builtin_msa_st_w(vc.i32, c, 0);

显然,要记住每次需要使用的确切成员类型很烦人,因此一些静态内联辅助函数肯定会很方便:

/*
 * Load a vector from `from` at offset `imm` with the doubleword load builtin
 * and return it wrapped in the msa128 union (stored through the i64 view,
 * which matches the builtin's v2i64 return type).
 * The cast drops `const` because the builtin is listed as taking `void *`;
 * the memory is only read.
 * NOTE(review): the builtin's offset is an immediate operand, so `imm`
 * presumably must fold to a compile-time constant after inlining -- confirm
 * this still builds without optimisation.
 */
static inline msa128  msa128_load64(const void *from, const int imm)
{ return (msa128){ .i64 = __builtin_msa_ld_d((void *)from, imm) }; }

/*
 * Load a vector from `from` at offset `imm` with the word load builtin and
 * return it wrapped in the msa128 union (stored through the i32 view, which
 * matches the builtin's v4i32 return type).
 * The cast drops `const` because the builtin is listed as taking `void *`;
 * the memory is only read.
 * NOTE(review): the builtin's offset is an immediate operand, so `imm`
 * presumably must fold to a compile-time constant after inlining -- confirm
 * this still builds without optimisation.
 */
static inline msa128  msa128_load32(const void *from, const int imm)
{ return (msa128){ .i32 = __builtin_msa_ld_w((void *)from, imm) }; }

/*
 * Load a vector from `from` at offset `imm` with the halfword load builtin
 * and return it wrapped in the msa128 union (stored through the i16 view,
 * which matches the builtin's v8i16 return type).
 * The cast drops `const` because the builtin is listed as taking `void *`;
 * the memory is only read.
 * NOTE(review): the builtin's offset is an immediate operand, so `imm`
 * presumably must fold to a compile-time constant after inlining -- confirm
 * this still builds without optimisation.
 */
static inline msa128  msa128_load16(const void *from, const int imm)
{ return (msa128){ .i16 = __builtin_msa_ld_h((void *)from, imm) }; }

/*
 * Load a vector from `from` at offset `imm` with the byte load builtin and
 * return it wrapped in the msa128 union (stored through the i8 view, which
 * matches the builtin's v16i8 return type).
 * The cast drops `const` because the builtin is listed as taking `void *`;
 * the memory is only read.
 * NOTE(review): the builtin's offset is an immediate operand, so `imm`
 * presumably must fold to a compile-time constant after inlining -- confirm
 * this still builds without optimisation.
 */
static inline msa128  msa128_load8(const void *from, const int imm)
{ return (msa128){ .i8  = __builtin_msa_ld_b((void *)from, imm) }; }

static inline void  msa128_store64(const msa128 val, void *to, const int imm)
{ __builtin_msa_st_d(val.i64, to, imm); }

static inline void  msa128_store32(const msa128 val, void *to, const int imm)
{ __builtin_msa_st_w(val.i32, to, imm); }

static inline void  msa128_store16(const msa128 val, void *to, const int imm)
{ __builtin_msa_st_h(val.i16, to, imm); }

static inline void  msa128_store8(const msa128 val, void *to, const int imm)
{ __builtin_msa_st_b(val.i8, to, imm); }

例如,二元的按位AND、OR、NOR和XOR操作是

/*
 * Bitwise AND of two 128-bit values.
 * The builtin takes vector operands, not the msa128 union, so each argument
 * is passed through its u8 view; the result is stored back through the same
 * view. Since the operation is bitwise, the lane width of the view is
 * irrelevant to the result.
 */
static inline msa128  msa128_and(const msa128 a, const msa128 b)
{ return (msa128){ .u8 = __builtin_msa_and_v(a.u8, b.u8) }; }

/*
 * Bitwise OR of two 128-bit values.
 * The builtin takes vector operands, not the msa128 union, so each argument
 * is passed through its u8 view; the result is stored back through the same
 * view. Since the operation is bitwise, the lane width of the view is
 * irrelevant to the result.
 */
static inline msa128  msa128_or(const msa128 a, const msa128 b)
{ return (msa128){ .u8 = __builtin_msa_or_v(a.u8, b.u8) }; }

/*
 * Bitwise NOR of two 128-bit values.
 * The builtin takes vector operands, not the msa128 union, so each argument
 * is passed through its u8 view; the result is stored back through the same
 * view. Since the operation is bitwise, the lane width of the view is
 * irrelevant to the result.
 */
static inline msa128  msa128_nor(const msa128 a, const msa128 b)
{ return (msa128){ .u8 = __builtin_msa_nor_v(a.u8, b.u8) }; }

/*
 * Bitwise XOR of two 128-bit values.
 * The builtin takes vector operands, not the msa128 union, so each argument
 * is passed through its u8 view; the result is stored back through the same
 * view. Since the operation is bitwise, the lane width of the view is
 * irrelevant to the result.
 */
static inline msa128  msa128_xor(const msa128 a, const msa128 b)
{ return (msa128){ .u8 = __builtin_msa_xor_v(a.u8, b.u8) }; }

创建一些宏,以数组形式来书写向量常量,大概也没有坏处:

/*
 * Literal constructors: each macro builds an msa128 compound literal whose
 * named member is brace-initialised from the macro arguments, so a vector
 * constant can be written like an array, e.g. MSA128_U32(1, 2, 3, 4).
 * The suffix selects which union member (element type) receives the values.
 */
#define  MSA128_U64(...)  ((msa128){ .u64 = { __VA_ARGS__ }})
#define  MSA128_I64(...)  ((msa128){ .i64 = { __VA_ARGS__ }})
#define  MSA128_F64(...)  ((msa128){ .f64 = { __VA_ARGS__ }})
#define  MSA128_U32(...)  ((msa128){ .u32 = { __VA_ARGS__ }})
#define  MSA128_I32(...)  ((msa128){ .i32 = { __VA_ARGS__ }})
#define  MSA128_F32(...)  ((msa128){ .f32 = { __VA_ARGS__ }})
#define  MSA128_U16(...)  ((msa128){ .u16 = { __VA_ARGS__ }})
#define  MSA128_I16(...)  ((msa128){ .i16 = { __VA_ARGS__ }})
#define  MSA128_U8(...)   ((msa128){ .u8  = { __VA_ARGS__ }})
#define  MSA128_I8(...)   ((msa128){ .i8  = { __VA_ARGS__ }})

我之所以建议使用这种特定于GCC的方法,是因为内建函数本身就是GCC特有的。除了联合类型之外,它与GCC在<immintrin.h>中实现Intel/AMD向量内在函数的方式非常接近。

答案 1 :(得分:0)

这是一种可同时用于C和C++的替代方法。它对寄存器变量执行memcpy。这些内联函数借鉴自ARM NEON支持:ARM为NEON向量提供了诸如vreinterpretq_u64_u8之类的类型转换。函数上的inline关键字需要C99。

#include <msa.h>
#include <stdint.h>
#include <string.h>

/*
 * Reinterpret the bits of an unsigned 4 x 32-bit vector as a signed one.
 * memcpy copies the object representation unchanged, so no lane value is
 * converted.
 * `static` gives the helper internal linkage: in C99 a plain `inline`
 * definition does not provide an external definition, so any call the
 * compiler decides not to inline (for example when building without
 * optimisation) would otherwise be left unresolved at link time.
 */
static inline v4i32 reinterpretq_i32_u32(const v4u32 val) {
    v4i32 res;
    memcpy(&res, &val, sizeof(res));
    return res;
}

/*
 * Reinterpret the bits of a signed 4 x 32-bit vector as an unsigned one.
 * memcpy copies the object representation unchanged, so no lane value is
 * converted.
 * `static` gives the helper internal linkage: in C99 a plain `inline`
 * definition does not provide an external definition, so any call the
 * compiler decides not to inline (for example when building without
 * optimisation) would otherwise be left unresolved at link time.
 */
static inline v4u32 reinterpretq_u32_i32(const v4i32 val) {
    v4u32 res;
    memcpy(&res, &val, sizeof(res));
    return res;
}

#define ALIGN16 __attribute__((aligned(16)))

int main(int argc, char* argv[])
{
    ALIGN16 uint32_t a[] = {64, 128, 256, 512};
    ALIGN16 uint32_t b[] = {1024, 2048, 4096, 8192};
    ALIGN16 uint32_t c[4];

    v4u32 va = reinterpretq_u32_i32(__builtin_msa_ld_w (a, 0));
    v4u32 vb = reinterpretq_u32_i32(__builtin_msa_ld_w (b, 0));

    v4u32 vc = __builtin_msa_adds_u_w (va, vb);
    __builtin_msa_st_w (reinterpretq_i32_u32(vc), c, 0);

    return 0;
}

然后以-O3进行编译(使用-Wall -Wextra也可以通过编译):

MSA$ mips-img-linux-gnu-gcc.exe -O3 -mmsa test.c -c
MSA$

反汇编看起来像通过了嗅探测试:

MSA$ mips-img-linux-gnu-objdump.exe --disassemble test.o

test.o:     file format elf32-tradbigmips

Disassembly of section .text:

00000000 <main>:
   0:   27bdffc8        addiu      sp,sp,-56
   4:   3c020000        lui        v0,0x0
   8:   24420000        addiu      v0,v0,0
   c:   78001062        ld.w       $w1,0(v0)
  10:   3c020000        lui        v0,0x0
  14:   24420000        addiu      v0,v0,0
  18:   78001022        ld.w       $w0,0(v0)
  1c:   79c10010        adds_u.w   $w0,$w0,$w1
  20:   7802e826        st.w       $w0,8(sp)
  24:   93a2000b        lbu        v0,11(sp)
  28:   03e00009        jr         ra
  2c:   27bd0038        addiu      sp,sp,56

出于完整性考虑,GCC 6.3.0:

MSA$ mips-img-linux-gnu-gcc.exe --version
mips-img-linux-gnu-gcc.exe (Codescape GNU Tools 2017.10-05 for MIPS IMG Linux) 6.3.0
Copyright (C) 2016 Free Software Foundation, Inc.