使用hs_err_pid * .log和gdb识别崩溃

时间:2011-08-02 09:05:08

标签: gdb

2011年9月12日更新

我能够获取核心文件并立即解除崩溃的指令。根据建议我跟踪r28的值(顺便说一下,没有注册表项记录到hs_err_pid * .log)并检查值的来源(见下面w /< ---)。但是,我无法确定r32的值。

错失对齐的原因是r28是一个加载到4字节整数r31的8字节整数吗?

;;;  1053               if( Transfer( len ) == FALSE ) {
0xc00000000c0c55c0:2 <TFM::PrintTrace(..)+0x32>:        adds         r44=0x480,r32;;  <---
0xc00000000c0c55d0:0 <TFM::PrintTrace(..)+0x40>:        ld8          r43=[ret2]
0xc00000000c0c55d0:1 <TFM::PrintTrace(..)+0x41>:  (p6)  st4          [r35]=ret3
0xc00000000c0c55d0:2 <TFM::PrintTrace(..)+0x42>:        adds         r48=28,r33
0xc00000000c0c55e0:0 <TFM::PrintTrace(..)+0x50>:        mov          ret0=0;;
0xc00000000c0c55e0:1 <TFM::PrintTrace(..)+0x51>:        ld8.c.clr    r62=[r45]
0xc00000000c0c55e0:2 <TFM::PrintTrace(..)+0x52>:        cmp.eq.unc   p6,p1=r0,r62
;;;  1056                       throw MutexLock ;
0xc00000000c0c55f0:0 <TFM::PrintTrace(..)+0x60>:        nop.m        0x0
0xc00000000c0c55f0:1 <TFM::PrintTrace(..)+0x61>:        nop.m        0x0
0xc00000000c0c55f0:2 <TFM::PrintTrace(..)+0x62>:  (p6)  br.cond.dpnt.many _NZ10TFM07PrintTraceEPi+0x800;;
;;;  1057               }
0xc00000000c0c5600:0 <TFM::PrintTrace(..)+0x70>:        adds         r41=0x488,r32
0xc00000000c0c5600:1 <TFM::PrintTrace(..)+0x71>:        adds         r40=0x490,r32
0xc00000000c0c5600:2 <TFM::PrintTrace(..)+0x72>:       br.call.dptk.many rp=0xc00000000c080620;;
;;;  1060               dwDataLen = len ;
0xc00000000c0c5610:0 <TFM::PrintTrace(..)+0x80>:        ld8          r16=[r44]    <---
0xc00000000c0c5610:1 <TFM::PrintTrace(..)+0x81>:        mov          gp=r36
0xc00000000c0c5610:2 <TFM::PrintTrace(..)+0x82>:  (p1)  mov          r62=8;;
0xc00000000c0c5620:0 <TFM::PrintTrace(..)+0x90>:        cmp.eq.unc   p6=r0,r16
0xc00000000c0c5620:1 <TFM::PrintTrace(..)+0x91>:        nop.m        0x0
0xc00000000c0c5620:2 <TFM::PrintTrace(..)+0x92>:  (p6)  br.cond.dpnt.many _NZ10TFM07PrintTraceEPi+0xda0;;
0xc00000000c0c5630:0 <TFM::PrintTrace(..)+0xa0>:        adds         r21=16,r16   <---
0xc00000000c0c5630:1 <TFM::PrintTrace(..)+0xa1>:  (p1)  mov          r62=8;;
0xc00000000c0c5630:2 <TFM::PrintTrace(..)+0xa2>:        nop.i        0x0
0xc00000000c0c5640:0 <TFM::PrintTrace(..)+0xb0>:        ld8          r42=[r21];;  <---
0xc00000000c0c5640:1 <TFM::PrintTrace(..)+0xb1>:        cmp.eq.unc   p6=r0,r42
0xc00000000c0c5640:2 <TFM::PrintTrace(..)+0xb2>:        nop.i        0x0
0xc00000000c0c5650:0 <TFM::PrintTrace(..)+0xc0>:        nop.m        0x0
0xc00000000c0c5650:1 <TFM::PrintTrace(..)+0xc1>:        mov          r47=5
0xc00000000c0c5650:2 <TFM::PrintTrace(..)+0xc2>:  (p6)  br.cond.dpnt.many _NZ10TFM07PrintTraceEPi+0xdf0;;
0xc00000000c0c5660:0 <TFM::PrintTrace(..)+0xd0>:        ld4.a        r27=[r48]
;;;  1064                       if( dwDataLen <= dwViewLen ) {
0xc00000000c0c5660:1 <TFM::PrintTrace(..)+0xd1>:        adds         r28=28,r42   <--
0xc00000000c0c5660:2 <TFM::PrintTrace(..)+0xd2>:        cmp.ne.unc   p6=r0,r46;;
0xc00000000c0c5670:0 <TFM::PrintTrace(..)+0xe0>:        ld4.sa       r26=[r28],
0xc00000000c0c5670:1 <TFM::PrintTrace(..)+0xe1>:  (p6)  ld4          r31=[r28]    <-- instruction that crashed

如果需要注册值,请告诉我。我想我可以使用gdb的info reg命令获取寄存器值。 这是信息寄存器的结果(我排除了prXXX和brXXX的值),我不知道如何将这些映射到上面的反汇编指令。

  gr1: 0x9fffffffbf716588
  gr2: 0x9fffffff5f667c00
  gr3: 0x9fffffff5f667c00
  gr4: 0x6000000000e0b000
  gr5: 0x9fffffff8adfe2e0
  gr6: 0x9fffffff8ada9000
  gr7: 0x9fffffff8ad7a000
  gr8:                0x1
  gr9: 0x9fffffff8adfd0f0
 gr10:                  0
 gr11: 0xc000000000000690
 gr12: 0x9fffffff8adfd140
 gr13: 0x6000000001681510
 gr14: 0x9fffffffbf7d8e98
 gr15:               0x1a
 gr16: 0x60000000044dac60
 gr17:               0x1f
 gr18:                  0
 gr19: 0x9fffffff8ad023f0
 gr20: 0x9fffffff8adfd0e0
 gr21: 0x60000000044dac70
 gr22: 0x9fffffff5f668000
 gr23:                0xd
 gr24:                0x1
 gr25: 0xc0000000004341f0
 gr26:                NaT
 gr27:               0x63
 gr28: 0xc00000000c5f801c
 gr29: 0xc00000000029db20
 gr30: 0xc00000000029db20
 gr31:              0x288
 gr32: 0x60000000044796d0
 gr33: 0x6000000001a78910
 gr34:               0x7e
 gr35: 0x6000000001d03a90
 gr36: 0x9fffffffbf716588
 gr37: 0xc000000000000c9d
 gr38: 0xc00000000c0c4f70
 gr39:                0x9
 gr40: 0x6000000004479b60
 gr41: 0x6000000004479b58
 gr42: 0xc00000000c5f8000
 gr43: 0x9fffffffbf7144e0
 gr44: 0x6000000004479b50
 gr45: 0x6000000004479b68
 gr46: 0x6000000001d03a90
 gr47:                0x5
 gr48: 0x6000000001a7892c
 gr49: 0x9fffffff8adfe110
 gr50: 0xc000000000000491
 gr51: 0xc00000000c0c5520
 gr52: 0xc00000000c07dd10
 gr53: 0x9fffffff8adfe120
 gr54: 0x9fffffff8adfe0a0
 gr55: 0xc00000000000058e
 gr56: 0xc00000000042be40
 gr57:               0x39
 gr58:                0x3
 gr59:               0x33
 gr60:                  0
 gr61: 0x9fffffffbf7d2598
 gr62:                0x8
 gr63: 0x9fffffffbf716588
 gr64: 0xc000000000000f22
 gr65: 0xc00000000c0c5610

这是我上一篇文章的更新。因为我提供了一份副本 在核心文件中,我使用gdb检查核心文件并执行 以下命令:

1)bt

2)帧n&lt; - 发生中止的帧

3)disas

以下是结果。

(gdb) bt
#0  0xc0000000001e5350:0 in _lwp_kill+0x30 ()
   from /usr/lib/hpux64/libpthread.so.1
#1  0xc00000000014c7b0:0 in pthread_kill+0x9d0 ()
   from /usr/lib/hpux64/libpthread.so.1
#2  0xc0000000002e4080:0 in raise+0xe0 () from /usr/lib/hpux64/libc.so.1
#3  0xc0000000003f47f0:0 in abort+0x170 () from /usr/lib/hpux64/libc.so.1
#4  0xc00000000e65e0d0:0 in os::abort ()
    at /CLO/Components/JAVA_HOTSPOT/Src/src/os/hp-ux/vm/os_hp-ux.cpp:2033
#5  0xc00000000eb473e0:0 in VMError::report_and_die ()
    at /CLO/Components/JAVA_HOTSPOT/Src/src/share/vm/utilities/vmError.cpp:1008
#6  0xc00000000e66fc90:0 in os::Hpux::JVM_handle_hpux_signal ()
    at /CLO/Components/JAVA_HOTSPOT/Src/src/os_cpu/hp-ux_ia64/vm/os_hp-ux_ia64.cpp:1051
#7  <signal handler called>
#8  0xc00000000c0c5670:1 in TFMTrace::PrintTrace () at tfmtrace.cpp:1064
#9  0xc00000000c0c4f70:0 in FMLogger::WriteLog () at fmlogger.cpp:90
...

(gdb) frame 8
#8  0xc00000000c0c5670:1 in TFMTrace::PrintTrace () at tfmtrace.cpp:1064
1064                            if( dwDataLen <= dwViewLen ) {
Current language:  auto; currently c++

(gdb) disas $pc-16*4 $pc+16*4
...
0xc00000000c0c5660:0 <TFMTrace::PrintTrace(...)+0xd0> :       ld4.a        r27=[r48]        MII,
;;;  1064               if( dwDataLen <= dwViewLen ) {
0xc00000000c0c5660:1 <TFMTrace::PrintTrace(...)+0xd1> :       adds         r28=28,r42
0xc00000000c0c5660:2 <TFMTrace::PrintTrace(...)+0xd2> :        cmp.ne.unc   p6=r0,r46;;
0xc00000000c0c5670:0 <TFMTrace::PrintTrace(...)+0xe0> :        ld4.sa      r26=[r28]        MMI,
0xc00000000c0c5670:1 <TFMTrace::PrintTrace(...)+0xe1> :  (p6)  ld4         r31=[r28]
0xc00000000c0c5670:2 <TFMTrace::PrintTrace(...)+0xe2> :       adds         r46=24,r42;;
0xc00000000c0c5680:0 <TFMTrace::PrintTrace(...)+0xf0> :  (p6)  st4       [r35]=r31         MI,I
0xc00000000c0c5680:1 <TFMTrace::PrintTrace(...)+0xf1> :       adds         r59=36,r42;;
0xc00000000c0c5680:2 <TFMTrace::PrintTrace(...)+0xf2> :        nop.i           0x0
0xc00000000c0c5690:0 <TFMTrace::PrintTrace(...)+0x100>:        ld4.c.clr   r27=[r48]        MIB,
;;;  1066               dwLen = dwTrcLen ;
0xc00000000c0c5690:1 <TFMTrace::PrintTrace(...)+0x101>:       cmp4.eq.unc  p6,p8=99,r27
0xc00000000c0c5690:2 <TFMTrace::PrintTrace(...)+0x102>:        nop.b           0x0;;
0xc00000000c0c56a0:0 <TFMTrace::PrintTrace(...)+0x110>:  (p8)  ld4.c.clr   r26=[r28]        MMI
;;;  1067               }
0xc00000000c0c56a0:1 <TFMTrace::PrintTrace(...)+0x111>:  (p6)  st4       [r48]=r47
0xc00000000c0c56a0:2 <TFMTrace::PrintTrace(...)+0x112>:       cmp4.geu.unc  p7=r26,r27
End of assemb

1 个答案:

答案 0 :(得分:0)

本机代码中的“正常”崩溃会导致报告如下:

C  [libc.so.6+0x88368]  strstr+0x64a

注意函数(在这种情况下为strstr)到崩溃点的小偏移量。

在你的情况下,JVM决定地址oxc00000000f675671libtracejni.so内,但它能找到的最接近的函数是非常远离崩溃点(0x5065eff9 == 1.2 GB以外。

你的图书馆真的那么大吗?

如果它真的那么大,你可能已经剥离了它,因此符号_NZ10TFM07PrintTraceEPi实际上与问题没有任何关系(在1.2GB的代码中)。

您需要在崩溃时找出地址oxc00000000f675671处的代码。通常hs_err_pid*.log包含所有共享库的加载地址列表。找到libtracejni.so的加载地址,从pc中减去它。这应该会为您提供类似于0x400...675671的地址,您应该可以在未提取的libtracejni.so版本中查找。

另请注意,崩溃地址以ASCII "C8G"结尾,这可能是也可能不是巧合。

更新2011/08/05。

现在您知道哪条指令崩溃了:

0x4000000000099670:1 <TFMTrace::PrintTrace(...)+0xe1>: (p6) ld4         r31=[r28]

这是r28指向的内存中的4字节整数加载。

接下来的问题是:r28在崩溃点的价值是什么(应该记录在hs_err * .log中),以及它来自哪里(完全反汇编TFM::PrintTrace会告诉我们那个)。