Question

我为Delphi和FPC的矩阵操作编写了一个很大的库。现在存在这个用于Intel AVX扩展的库的扩展，但是我只能设法在FPC中编译。我的想法是创造 FPC中的.o文件包含AVX汇编程序代码并包含这些代码 Delphi中的文件。我试着在这里关注这个问题： Linking FPC .o files into Delphi

但没有成功。我能够转储函数名称并尝试导入这些在Delphi单元中。问题是我总是得到一个错误说 .o文件格式错误。

我使用CodeTyphoon进行编译，内部使用FPC 3.1.1和 Delphi2010作为第一次尝试。

代码一旦在FPC中编译，一次在Delphi中使用approriate编译的ifdef。

我的基本代码看起来像这样（只是摘录）：

// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2011, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################


unit AVXMatrixMultOperations;

interface

{$IFDEF CPUX64}
{$DEFINE x64}
{$ENDIF}
{$IFDEF cpux86_64}
{$DEFINE x64}
{$ENDIF}
{$IFNDEF x64}

uses MatrixConst;

{$IFNDEF FPC}
// this fails -> wrong object format
{$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'}
{$ENDIF}

// full matrix operations
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
{$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF}

{$ENDIF}

implementation

{$IFDEF FPC} {$ASMMODE intel} {$ENDIF}

{$IFNDEF x64}

{$IFDEF FPC}

procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
var bytesWidth2, destOffset : TASMNativeInt;
    iter : TASMNativeInt;
{$IFDEF FPC}
begin
{$ENDIF}
asm
   // prolog - simulate stack
   push ebx;
   push edi;
   push esi;

   mov ecx, dest;

   mov edi, width1;
   imul edi, -8;
   mov iter, edi;

   sub mt1, edi;

   //destOffset := destLineWidth - Width2*sizeof(double);
   mov ebx, Width2;
   shl ebx, 3;
   mov eax, destLineWidth;
   sub eax, ebx;
   mov destOffset, eax;

   //bytesWidth2 := width2*sizeof(double);
   mov bytesWidth2, ebx;

   // for y := 0 to height1 - 1 do
   @@foryloop:

      // r12 -> counter to width2
      mov esi, width2;
      sub esi, 2;
      jl @LastXColumn;

      @@forxloop:
      // for x := 0 to width2 div 2 - 1
          // esi: mt1 - width1*sizeof(double)
          // mt2: mt2
          mov edx, mt1;
          mov ebx, mt2;
          mov eax, iter;
          mov edi, LineWidth2;

          vxorpd ymm0, ymm0, ymm0;
          vxorpd ymm1, ymm1, ymm1;

          cmp eax, -32;
          jg @@Innerloop2Begin;

          // for z := 0 to width1 - 1do
          // AVX part:
          @@InnerLoop1:
             // 4x4 block
             vmovapd xmm2, [ebx];
             add ebx, edi;
             vmovapd xmm4, xmm2;

             vmovapd xmm3, [ebx];
             add ebx, edi;

             // shuffle so we can multiply

             // swap such that we can immediately multiply
             vmovlhps xmm2, xmm2, xmm3;
             vmovhlps xmm3, xmm3, xmm4;

             // next 4 elements
             vmovapd xmm4, [ebx];
             add ebx, edi;
             vmovapd xmm6, xmm4;

             vmovapd xmm5, [ebx];
             add ebx, edi;

             vmovapd ymm7, [edx + eax]

             vmovlhps xmm4, xmm4, xmm5;
             vmovhlps xmm5, xmm5, xmm6;

             vinsertf128 ymm2, ymm2, xmm4, 1;
             vinsertf128 ymm3, ymm3, xmm5, 1;

             // now multiply and add
             vmulpd ymm2, ymm2, ymm7;
             vmulpd ymm3, ymm3, ymm7;

             vaddpd ymm0, ymm0, ymm2;
             vaddpd ymm1, ymm1, ymm3;
          add eax, 32;
          jl @@InnerLoop1;

          vextractf128 xmm2, ymm0, 1;
          vextractf128 xmm3, ymm1, 1;

          vhaddpd xmm0, xmm0, xmm2;
          vhaddpd xmm1, xmm1, xmm3;

          test eax, eax;
          jz @@InnerLoopEnd2;

          @@Innerloop2Begin:

          // rest in single elements
          @@InnerLoop2:
             vmovapd xmm2, [ebx];
             add ebx, edi;

             vmovddup xmm3, [edx + eax];

             vmulpd xmm2, xmm2, xmm3;
             vmovhlps xmm4, xmm4, xmm2;

             vaddsd xmm0, xmm0, xmm2;
             vaddsd xmm1, xmm1, xmm4;
          add eax, 8;
          jnz @@InnerLoop2;

          @@InnerLoopEnd2:

          // finall horizontal addition
          vhaddpd xmm0, xmm0, xmm1;

          vmovapd [ecx], xmm0;

          // increment the pointers
          // inc(mt2), inc(dest);
          //add dword ptr [mt2], 8;
          add mt2, 16;
          add ecx, 16;

      // end for x := 0 to width2 div 2 - 1
      sub esi, 2;
      jge @@forxloop;

      @LastXColumn:

      cmp esi, -1;
      jne @NextLine;

      // last column of mt2
      mov eax, iter;
      mov ebx, mt2;

      vxorpd xmm0, xmm0, xmm0;

      @InnerLoop2:
         vmovsd xmm1, [edx + eax];
         vmovsd xmm2, [ebx];

         vmulsd xmm1, xmm1, xmm2;
         vaddsd xmm0, xmm0, xmm1;

         add ebx, edi;
      add eax, 8;
      jnz @InnerLoop2;

      vmovsd [ecx], xmm0;
      add ecx, 8;
      add mt2, 8;

      @NextLine:
      // dec(mt2, Width2);
      // inc(PByte(mt1), LineWidth1);
      // inc(PByte(dest), destOffset);
      //mov ebx, bytesWidth2;
      //sub dword ptr [mt2], ebx;
      mov eax, bytesWidth2;
      sub mt2, eax;
      mov eax, LineWidth1;
      add mt1, eax;
      add ecx, destOffset;

   // end for y := 0 to height1 - 1
   //dec eax;
   dec height1;
   jnz @@foryloop;

   // epilog
   vzeroupper;

   pop esi;
   pop edi;
   pop ebx;
end;
{$IFDEF FPC}
end;
{$ENDIF}

{$ENDIF}

{$ENDIF}

end.

Answer 1

由于此处涉及单个功能，最简单的是IMHO直接转换FPC AVXMatrixMultOperations.o文件。

使用精彩的Object file converter工具。

您可以尝试从一种二进制格式转换为另一种格式，由Delphi接受。

但我想最干净的方法是将其转换为asm：

objconv -fasm AVXMatrixMultOperations.o

它将创建一个AVXMatrixMultOperations.asm文件，该文件可用于通过简单的db ..,..,..,..字节替换未知的AVX指令。通常，生成的.asm文件的左侧是汇编程序，右侧是原始的十六进制字节。

这就是我在库中处理旧Delphi编译器的方式，例如：

function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
asm // eax=crc, edx=buf, ecx=len
        not     eax
        test    ecx, ecx
        jz      @0
        test    edx, edx
        jz      @0
@3:     test    edx, 3
        jz      @8 // align to 4 bytes boundary
        {$ifdef ISDELPHI2010}
        crc32   eax, byte ptr[edx]
        {$else}
        db      $F2, $0F, $38, $F0, $02
        {$endif}
        inc     edx
        ....

所以在你的情况下，像

{$ifdef FPC}
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
{$else}
db $xx,$yy,$zz
db $xx,$yy,$zz
{$endif}

在Delphi 2010中使用FPC .o文件

1 个答案: