我使用 Arm NEON 内部函数(intrinsics)为 float16 向量(以 uint16 的形式传入)编写了一个矩阵乘法函数。直接运行程序一切正常,但在 valgrind / callgrind 下运行时会崩溃。下面是测试程序的代码:
#include <stdlib.h>
#include <math.h>
#include <arm_neon.h>
#include <stdio.h>
#include <vector>
#include <glm/vec3.hpp> // glm::vec3
#include <glm/vec4.hpp> // glm::vec4, glm::ivec4
#include <glm/mat4x4.hpp> // glm::mat4
#include <glm/gtc/matrix_transform.hpp> // glm::translate, glm::rotate, glm::scale, glm::perspective
#include <glm/gtc/type_ptr.hpp> // glm::value_ptr
#include <glm/gtc/packing.hpp>
/**
 * Transforms an array of half-precision 4-vectors by a 4x4 matrix using NEON.
 *
 * Each input element holds four IEEE half-precision values stored as raw
 * 16-bit integers. Every element is widened to single precision and
 * multiplied by `matrix`; the result is written to the element of `output`
 * with the same index.
 *
 * @param matrix  4x4 single-precision matrix; its 16 floats are read as four
 *                consecutive 4-float columns (GLM's column-major layout).
 * @param input   Vectors to transform, as raw half-float bit patterns.
 * @param output  Receives one result per input element. It is grown to
 *                input.size() if it is smaller; elements beyond
 *                input.size() are left untouched.
 */
void __attribute__ ((noinline))
transformVectorU16 ( glm::mat4 const & matrix,
                     std::vector < glm::u16vec4 > const & input,
                     std::vector < glm::vec4 > & output )
{
    // Never write past the end of the destination buffer.
    if ( output.size() < input.size() )
        output.resize( input.size() );

    // Load the four columns with explicit NEON loads instead of casting the
    // GLM object to a NEON aggregate type.
    const float32_t * const matrixData = glm::value_ptr( matrix );
    const float32x4_t col0 = vld1q_f32( matrixData + 0 );
    const float32x4_t col1 = vld1q_f32( matrixData + 4 );
    const float32x4_t col2 = vld1q_f32( matrixData + 8 );
    const float32x4_t col3 = vld1q_f32( matrixData + 12 );

    std::vector < glm::vec4 >::iterator outVertex = output.begin();
    for ( glm::u16vec4 const & inVertex : input )
    {
        // Load the raw 16-bit lanes, reinterpret them as halves, then widen.
        // NOTE: the half->single conversion is the instruction Valgrind 3.13
        // cannot translate on arm64; that is a limitation of the tool.
        const float16x4_t half4 =
            vreinterpret_f16_u16( vld1_u16( glm::value_ptr( inVertex ) ) );
        const float32x4_t vertex = vcvt_f32_f16( half4 );
        const float32x2_t xy = vget_low_f32( vertex );
        const float32x2_t zw = vget_high_f32( vertex );

        // matrix * v = col0 * v.x + col1 * v.y + col2 * v.z + col3 * v.w:
        // each column is scaled by ONE broadcast component of the vector.
        float32x4_t rslt = vmulq_lane_f32( col0, xy, 0 );
        rslt = vmlaq_lane_f32( rslt, col1, xy, 1 );
        rslt = vmlaq_lane_f32( rslt, col2, zw, 0 );
        rslt = vmlaq_lane_f32( rslt, col3, zw, 1 );

        vst1q_f32( glm::value_ptr( *outVertex ), rslt );
        ++outVertex;
    }
}
int main(int argc, char* argv[])
{
glm::mat4 matrix( 1,0,0,13,
2,0,0,14,
3,0,0,15,
4,0,0,16 );
union Convert
{
glm::uint16 val;
__fp16 t;
} u[4];
std::vector < glm::u16vec4 > c;
std::vector < glm::vec4 > b;
size_t num = rand() % 10000;
for ( size_t i = 0; i < num; ++i )
{
u[0].t = float(c.size());
u[1].t = float(c.size());
u[2].t = float(c.size());
u[3].t = float(c.size());
c.push_back ( glm::u16vec4 ( u[0].val,
u[1].val,
u[2].val,
u[3].val ) );
}
b.resize ( c.size() );
transformVectorU16 ( matrix, c , b );
for ( auto & bentry : b )
printf("%f %f %f %f\n", bentry[0], bentry[1], bentry[2], bentry[3] );
return 0;
}
这是崩溃报告:
==433== Memcheck, a memory error detector
==433== Copyright (C) 2002-2017, and GNU GPL'd, by Julian Seward et al.
==433== Using Valgrind-3.13.0 and LibVEX; rerun with -h for copyright info
==433== Command: ./vectortest
==433==
t3 = GET:F16(338)
vex: the `impossible' happened:
iselStmt
vex storage: T total 61973608 bytes allocated
vex storage: P total 0 bytes allocated
valgrind: the 'impossible' happened:
LibVEX called failure_exit().
host stacktrace:
==433== at 0x5803CF0C: show_sched_status_wrk (m_libcassert.c:355)
==433== by 0x5803D043: report_and_quit (m_libcassert.c:426)
==433== by 0x5803D277: panic (m_libcassert.c:502)
==433== by 0x5803D277: vgPlain_core_panic_at (m_libcassert.c:507)
==433== by 0x5803D297: vgPlain_core_panic (m_libcassert.c:512)
==433== by 0x5805A907: failure_exit (m_translate.c:740)
==433== by 0x5810BF33: vpanic (main_util.c:231)
==433== by 0x58164E1F: iselStmt (host_arm64_isel.c:4003)
==433== by 0x58164E1F: iselSB_ARM64 (host_arm64_isel.c:4201)
==433== by 0x58109AFB: libvex_BackEnd (main_main.c:1047)
==433== by 0x58109AFB: LibVEX_Translate (main_main.c:1174)
==433== by 0x5805CFCB: vgPlain_translate (m_translate.c:1794)
==433== by 0x58093DD7: handle_chain_me (scheduler.c:1084)
==433== by 0x58095A2F: vgPlain_scheduler (scheduler.c:1428)
==433== by 0x580A69A3: thread_wrapper (syswrap-linux.c:103)
==433== by 0x580A69A3: run_a_thread_NORETURN (syswrap-linux.c:156)
==433== by 0xFFFFFFFFFFFFFFFF: ???
sched status:
running_tid=1
Thread 1: status = VgTs_Runnable (lwpid 433)
==433== at 0x400C1C: vcvt_f32_f16 (arm_neon.h:14818)
==433== by 0x400C1C: transformVectorU16(glm::tmat4x4<float, (glm::precision)0> const&, std::vector<glm::tvec4<unsigned short, (glm::precision)0>, std::allocator<glm::tvec4<unsigned short, (glm::precision)0> > > const&, std::vector<glm::tvec4<float, (glm::precision)0>, std::allocator<glm::tvec4<float, (glm::precision)0> > >&) (vectortestcleaned.cpp:28)
==433== by 0x400907: main (vectortestcleaned.cpp:68)
Note: see also the FAQ in the source distribution.
It contains workarounds to several common problems.
In particular, if Valgrind aborted or crashed after
identifying problems in your program, there's a good chance
that fixing those problems will prevent Valgrind aborting or
crashing, especially if it happened in m_mallocfree.c.
If that doesn't help, please report this bug to: www.valgrind.org
In the bug report, send all the above text, the valgrind
version, and what OS and version you are using. Thanks.
我完全看不懂这条错误信息。你能告诉我问题出在哪里吗?
谢谢