在 ARM 上编译 TensorFlow 时出错:'asm' operand has impossible constraints('asm' 操作数的约束无法满足)

时间:2017-05-14 06:05:39

标签: tensorflow arm

我试图在我的设备上编译 TensorFlow(直接在板子上本机编译,而不是交叉编译(cross-compile)),但在编译 gemmlowp 时出现了一个奇怪的问题。以下是错误日志:

In file included from external/gemmlowp/meta/streams.h:293:0,
             from external/gemmlowp/meta/quantized_mul_kernels.h:22,
             from ./tensorflow/core/kernels/meta_support.h:21,
             from tensorflow/core/kernels/meta_support.cc:18:
external/gemmlowp/meta/streams_arm_32.h: In static member function 'static void gemmlowp::meta::GemmExecutorPackLHS::ExecuteDispatch3D(const P&) [with P = gemmlowp::meta::GemmParams<unsigned char, int, gemmlowp::meta::ColumnMajorWithSum, gemmlowp::meta::RowMajorWithSum, gemmlowp::meta::QuantizedStaticPreprocessedAsInt32, gemmlowp::meta::RowMajor>; int m = 1; int n = 8; int k = 8; int m_leftovers = 0; int n_leftovers = 7; int k_leftovers = 4]':
external/gemmlowp/meta/streams_arm_32.h:4211:59: error: can't find a register in class 'LO_REGS' while reloading 'asm'
         "d25", "d26", "d27", "d28", "d29", "cc", "memory");
                                                           ^
external/gemmlowp/meta/streams_arm_32.h:4211:59: error: 'asm' operand has impossible constraints
Target //tensorflow/tools/pip_package:build_pip_package failed to build
INFO: Elapsed time: 183.585s, Critical Path: 179.60s

我的编译选项是:

bazel build -c opt --copt="-mfpu=neon-vfpv4" --copt="-funsafe-math-optimizations" --copt="-ftree-vectorize" --copt="-fomit-frame-pointer" --local_resources 1536,1.0,1.0 --verbose_failures tensorflow/tools/pip_package:build_pip_package

芯片信息:Cortex-A17

processor       : 3
model name      : ARMv7 Processor rev 1 (v7l)
BogoMIPS        : 48.00
Features        : swp half thumb fastmult vfp edsp neon vfpv3 tls vfpv4         idiva idivt vfpd32 evtstrm 
CPU implementer : 0x41
CPU architecture: 7
CPU variant     : 0x0
CPU part        : 0xc0d
CPU revision    : 1

操作系统:Ubuntu 14.04 32位

streams_arm_32.h [第4101 - 4212行]

// Packs seven input rows into `out` and appends one scaled/offset sum per row.
//
// The seven rows start at `in`, `in + stride`, ..., `in + 6 * stride`.
// Data is consumed 8 bytes per row per loop iteration, followed by a final
// 4-byte-per-row tail that is zero-padded to 8 bytes when stored. After the
// packed data, eight 32-bit values are written: for each row,
//   (sum of the row's bytes) * multiplicative_sum_offset + additive_sum_offset
// with the eighth slot duplicating the seventh row's value.
//
// Parameters:
//   in     - pointer to the first byte of row 0; advanced by the asm.
//   params - supplies `count`, `stride`, `multiplicative_sum_offset` and
//            `additive_sum_offset` (all read below).
//   out    - destination; stores use a 64-bit alignment qualifier.
//
// NOTE(review): the asm names r0-r5 as clobbers and additionally needs six
// general-purpose register operands (count, in, out, stride and the two sum
// offsets), i.e. twelve core registers at once. This is presumably what makes
// register allocation fail ("'asm' operand has impossible constraints") under
// some compiler flags / instruction-set modes -- confirm against the toolchain.
template <>
inline void Stream<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack(
    const uint8_t* in, const RowMajorWithSum& params, uint8_t* out) {
#ifdef DEBUG
#ifdef DEBUG_METAGEMM_VERBOSE
  // Verbose trace of the call site; only compiled when both macros are set.
  std::cout << __FILE__ << "(" << __LINE__
            << ") RowMajorWithSum<uint8_t, 7, 8, 4, RowMajorWithSum>::Pack()"
            << std::endl
            << std::flush;
#endif
#endif
  // Local, writable copy: the asm decrements the count in place ("+r"), while
  // `params` is const.
  int params_count_copy = params.count;
  asm volatile(
      // r0..r5 = pointers to rows 1..6 (row 0 is addressed through %[in]).
      "add r0, %[in], %[stride]\n"
      "add r1, r0, %[stride]\n"
      "add r2, r1, %[stride]\n"
      "add r3, r2, %[stride]\n"
      "add r4, r3, %[stride]\n"
      "add r5, r4, %[stride]\n"

      // Reduce count by leftovers.
      "sub %[count], %[count], #4\n"
      // q8..q14: one 8 x u16 accumulator per row, cleared to zero.
      "vmov.i16 q8, #0\n"
      "vmov.i16 q9, #0\n"
      "vmov.i16 q10, #0\n"
      "vmov.i16 q11, #0\n"
      "vmov.i16 q12, #0\n"
      "vmov.i16 q13, #0\n"
      "vmov.i16 q14, #0\n"

      // Loop head. The label literal has no trailing newline, so it shares an
      // assembler line with the `subs` that follows. The loop body always runs
      // at least once and exits only when count reaches exactly zero.
      // NOTE(review): assumes (params.count - 4) is a positive multiple of 8
      // -- TODO confirm with callers.
      "1:"
      "subs %[count], %[count], #8\n"

      // Load Aggregate Store: 7x8.
      // Load 8 bytes from each row (post-incrementing every row pointer),
      // widen-add them into the per-row u16 accumulators, then store the
      // 7 * 8 = 56 bytes contiguously to `out`.
      "vld1.32 {d0}, [%[in]]!\n"
      "vld1.32 {d1}, [r0]!\n"
      "vld1.32 {d2}, [r1]!\n"
      "vld1.32 {d3}, [r2]!\n"
      "vld1.32 {d4}, [r3]!\n"
      "vld1.32 {d5}, [r4]!\n"
      "vld1.32 {d6}, [r5]!\n"
      "vaddw.u8 q8, q8, d0\n"
      "vaddw.u8 q9, q9, d1\n"
      "vaddw.u8 q10, q10, d2\n"
      "vaddw.u8 q11, q11, d3\n"
      "vaddw.u8 q12, q12, d4\n"
      "vaddw.u8 q13, q13, d5\n"
      "vaddw.u8 q14, q14, d6\n"
      "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n"
      "vst1.32 {d4, d5, d6}, [%[out]:64]!\n"

      // Branches on the flags produced by the `subs` at the loop head.
      "bne 1b\n"

      // Load Aggregate Store: 7x4.
      // Tail: clear d0..d6 first so that only the low 4 bytes of each row are
      // non-zero, load one 32-bit lane per row, accumulate, and store the
      // zero-padded 56 bytes.
      "vmov.i8 d0, #0\n"
      "vmov.i8 d1, #0\n"
      "vmov.i8 d2, #0\n"
      "vmov.i8 d3, #0\n"
      "vmov.i8 d4, #0\n"
      "vmov.i8 d5, #0\n"
      "vmov.i8 d6, #0\n"
      "vld1.32 {d0[0]}, [%[in]]!\n"
      "vld1.32 {d1[0]}, [r0]!\n"
      "vld1.32 {d2[0]}, [r1]!\n"
      "vld1.32 {d3[0]}, [r2]!\n"
      "vld1.32 {d4[0]}, [r3]!\n"
      "vld1.32 {d5[0]}, [r4]!\n"
      "vld1.32 {d6[0]}, [r5]!\n"
      "vaddw.u8 q8, q8, d0\n"
      "vaddw.u8 q9, q9, d1\n"
      "vaddw.u8 q10, q10, d2\n"
      "vaddw.u8 q11, q11, d3\n"
      "vaddw.u8 q12, q12, d4\n"
      "vaddw.u8 q13, q13, d5\n"
      "vaddw.u8 q14, q14, d6\n"
      "vst1.32 {d0, d1, d2, d3}, [%[out]:64]!\n"
      "vst1.32 {d4, d5, d6}, [%[out]:64]!\n"

      // Aggregator Reduction.
      // d0[0] holds the multiplier; q1 holds the additive offset in all lanes.
      "vmov.32 d0[0], %[multiplicative_sum_offset]\n"
      "vdup.32 q1, %[additive_sum_offset]\n"
      // Pairwise-widen each accumulator from 8 x u16 to 4 x u32.
      "vpaddl.u16 q8, q8\n"
      "vpaddl.u16 q9, q9\n"
      "vpaddl.u16 q10, q10\n"
      "vpaddl.u16 q11, q11\n"
      "vpaddl.u16 q12, q12\n"
      "vpaddl.u16 q13, q13\n"
      "vpaddl.u16 q14, q14\n"
      // First pairwise add: each row's 4 x u32 collapses to 2 x u32 in the
      // even-numbered d register of its accumulator.
      "vpadd.u32 d16, d16, d17\n"
      "vpadd.u32 d18, d18, d19\n"
      "vpadd.u32 d20, d20, d21\n"
      "vpadd.u32 d22, d22, d23\n"
      "vpadd.u32 d24, d24, d25\n"
      "vpadd.u32 d26, d26, d27\n"
      "vpadd.u32 d28, d28, d29\n"
      // Second pairwise add: finish each row sum and pack two rows per d
      // register: d16 = {row0, row1}, d17 = {row2, row3}, d18 = {row4, row5},
      // d19 = {row6, row6}.
      "vpadd.u32 d16, d16, d18\n"
      "vpadd.u32 d17, d20, d22\n"
      "vpadd.u32 d18, d24, d26\n"
      "vpadd.u32 d19, d28, d28\n"
      // sum * multiplicative_sum_offset + additive_sum_offset, per lane.
      "vmul.i32 q8, q8, d0[0]\n"
      "vmul.i32 q9, q9, d0[0]\n"
      "vadd.i32 q8, q8, q1\n"
      "vadd.i32 q9, q9, q1\n"
      // Store the eight 32-bit results; `out` is not advanced afterwards.
      "vst1.32 {d16, d17, d18, d19}, [%[out]:64]\n"
      // Outputs: all three are read and modified by the asm.
      : [count] "+r"(params_count_copy), [in] "+r"(in), [out] "+r"(out)
      // Inputs: read-only values, each in a general-purpose register.
      : [stride] "r"(params.stride),
        [multiplicative_sum_offset] "r"(params.multiplicative_sum_offset),
        [additive_sum_offset] "r"(params.additive_sum_offset)
      // Clobbers: the six row-pointer registers, every NEON register used
      // above, the condition flags (subs) and memory (loads/stores).
      : "r0", "r1", "r2", "r3", "r4", "r5", "d0", "d1", "d2", "d3", "d4", "d5",
        "d6", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24",
        "d25", "d26", "d27", "d28", "d29", "cc", "memory");
}

我是按照 https://github.com/samjabrahams/tensorflow-on-raspberry-pi/blob/master/GUIDE.md 中的说明进行设置的。

我非常感谢任何建议。

0 个答案:

没有答案