Question

我使用 Arm NEON 内部函数（intrinsics）编写了一个针对 float16 向量（以 uint16 形式传入）的矩阵乘法函数。程序直接运行时一切正常，但在 valgrind / callgrind 下运行就会崩溃。以下是测试程序的代码：

#include <stdlib.h>
#include <math.h>
#include <arm_neon.h>
#include <stdio.h>
#include <vector>
#include <glm/vec3.hpp> // glm::vec3
#include <glm/vec4.hpp> // glm::vec4, glm::ivec4
#include <glm/mat4x4.hpp> // glm::mat4
#include <glm/gtc/matrix_transform.hpp> // glm::translate, glm::rotate, glm::scale, glm::perspective
#include <glm/gtc/type_ptr.hpp> // glm::value_ptr
#include <glm/gtc/packing.hpp>

// Transforms every half-precision vertex in `input` by `matrix` and stores the
// float results in `output`.
//
// matrix : 4x4 float matrix; matrix[k] is read as four consecutive floats.
// input  : vertices whose four uint16 components are half-float bit patterns.
// output : receives one vec4 per input vertex at the same index. It is grown
//          to input.size() if it is smaller; extra trailing elements are left
//          untouched.
//
// The result for a vertex v is
//    matrix[0]*v[0] + matrix[1]*v[1] + matrix[2]*v[2] + matrix[3]*v[3].
void __attribute__ ((noinline))
transformVectorU16 ( glm::mat4 const & matrix,
                  std::vector < glm::u16vec4 > const & input,
                  std::vector < glm::vec4 >          & output )
{
   const size_t vertexCount = input.size();

   // The stores below write vertexCount elements; make sure they all exist.
   if ( output.size() < vertexCount )
      output.resize( vertexCount );

   // Use the load intrinsics instead of casting the glm object to a NEON
   // struct pointer: no alignment or strict-aliasing assumptions are needed.
   const float32x4_t column0 = vld1q_f32( &matrix[0][0] );
   const float32x4_t column1 = vld1q_f32( &matrix[1][0] );
   const float32x4_t column2 = vld1q_f32( &matrix[2][0] );
   const float32x4_t column3 = vld1q_f32( &matrix[3][0] );

   for ( size_t i = 0; i < vertexCount; ++i )
   {
      // Load the raw 16-bit patterns, then reinterpret them as half floats.
      const uint16x4_t  halfBits = vld1_u16( &input[i][0] );

      // NOTE(review): this widening conversion is the call named in the
      // Valgrind crash trace quoted in this file; that appears to be a tool
      // limitation rather than a defect in this code - confirm with a newer
      // Valgrind release.
      const float32x4_t vertex   = vcvt_f32_f16( vreinterpret_f16_u16( halfBits ) );

      // Each column is scaled by ONE component of the vertex (broadcast),
      // not multiplied lane-by-lane with the whole vertex.
      float32x4_t rslt = vmulq_n_f32(       column0, vgetq_lane_f32( vertex, 0 ) );
      rslt             = vmlaq_n_f32( rslt, column1, vgetq_lane_f32( vertex, 1 ) );
      rslt             = vmlaq_n_f32( rslt, column2, vgetq_lane_f32( vertex, 2 ) );
      rslt             = vmlaq_n_f32( rslt, column3, vgetq_lane_f32( vertex, 3 ) );

      vst1q_f32( &output[i][0], rslt );
   }
}


int main(int argc, char* argv[])
{ 
   glm::mat4 matrix( 1,0,0,13,
                     2,0,0,14,
                     3,0,0,15,
                     4,0,0,16 );

   union Convert
   {
      glm::uint16 val; 
      __fp16 t;
   } u[4];

   std::vector < glm::u16vec4 > c;
   std::vector < glm::vec4 > b;
   size_t num = rand() % 10000;
   for ( size_t i = 0; i < num; ++i )
   {
      u[0].t = float(c.size());
      u[1].t = float(c.size());
      u[2].t = float(c.size());
      u[3].t = float(c.size());
      c.push_back ( glm::u16vec4 ( u[0].val,
                                   u[1].val,
                                   u[2].val,
                                   u[3].val ) );
   }

   b.resize ( c.size() );
   transformVectorU16 ( matrix, c , b );
   for ( auto & bentry : b )
      printf("%f %f %f %f\n", bentry[0], bentry[1], bentry[2], bentry[3] );

   return 0;
}

这是崩溃报告：

==433== Memcheck, a memory error detector
==433== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==433== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==433== Command: ./vectortest
==433== 
t3 = GET:F16(338)
vex: the `impossible' happened:
   iselStmt
vex storage: T total 61973608 bytes allocated
vex storage: P total 0 bytes allocated

valgrind: the 'impossible' happened:
   LibVEX called failure_exit().

host stacktrace:
==433==    at 0x5803CF0C: show_sched_status_wrk (m_libcassert.c:355)
==433==    by 0x5803D043: report_and_quit (m_libcassert.c:426)
==433==    by 0x5803D277: panic (m_libcassert.c:502)
==433==    by 0x5803D277: vgPlain_core_panic_at (m_libcassert.c:507)
==433==    by 0x5803D297: vgPlain_core_panic (m_libcassert.c:512)
==433==    by 0x5805A907: failure_exit (m_translate.c:740)
==433==    by 0x5810BF33: vpanic (main_util.c:231)
==433==    by 0x58164E1F: iselStmt (host_arm64_isel.c:4003)
==433==    by 0x58164E1F: iselSB_ARM64 (host_arm64_isel.c:4201)
==433==    by 0x58109AFB: libvex_BackEnd (main_main.c:1047)
==433==    by 0x58109AFB: LibVEX_Translate (main_main.c:1174)
==433==    by 0x5805CFCB: vgPlain_translate (m_translate.c:1794)
==433==    by 0x58093DD7: handle_chain_me (scheduler.c:1084)
==433==    by 0x58095A2F: vgPlain_scheduler (scheduler.c:1428)
==433==    by 0x580A69A3: thread_wrapper (syswrap-linux.c:103)
==433==    by 0x580A69A3: run_a_thread_NORETURN (syswrap-linux.c:156)
==433==    by 0xFFFFFFFFFFFFFFFF: ???

sched status:
  running_tid=1

Thread 1: status = VgTs_Runnable (lwpid 433)
==433==    at 0x400C1C: vcvt_f32_f16 (arm_neon.h:14818)
==433==    by 0x400C1C: transformVectorU16(glm::tmat4x4<float, (glm::precision)0> const&, std::vector<glm::tvec4<unsigned short, (glm::precision)0>, std::allocator<glm::tvec4<unsigned short, (glm::precision)0> > > const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > >&) (vectortestcleaned.cpp:28)
==433==    by 0x400907: main (vectortestcleaned.cpp:68)


Note: see also the FAQ in the source distribution.
It contains workarounds to several common problems.
In particular, if Valgrind aborted or crashed after
identifying problems in your program, there's a good chance
that fixing those problems will prevent Valgrind aborting or
crashing, especially if it happened in m_mallocfree.c.

If that doesn't help, please report this bug to: www.valgrind.org

In the bug report, send all the above text, the valgrind
version, and what OS and version you are using.  Thanks.

我完全看不懂这条错误信息。能告诉我是哪里出了问题吗？

致谢

Valgrind在Arm64和Intrinsics上失败

0 个答案: