我为Delphi和FPC编写了一个很大的矩阵运算库。现在这个库有了一个针对Intel AVX指令集的扩展,但我只能在FPC中把它编译出来。我的想法是用FPC生成包含AVX汇编代码的.o文件,然后在Delphi中链接这些文件。我尝试按照这个问题里的做法操作: Linking FPC .o files into Delphi
但没有成功。我能够导出(dump)函数名称,并尝试在Delphi单元中导入这些函数。问题是我总是得到一个错误,提示.o文件的格式不正确。
我使用CodeTyphon进行编译(其内部使用FPC 3.1.1),并以Delphi 2010作为第一次尝试的目标。
同一份代码借助相应的ifdef,一次在FPC中编译,一次在Delphi中编译。
我的基本代码看起来像这样(只是摘录):
// ###################################################################
// #### This file is part of the mathematics library project, and is
// #### offered under the licence agreement described on
// #### http://www.mrsoft.org/
// ####
// #### Copyright:(c) 2011, Michael R. . All rights reserved.
// ####
// #### Unless required by applicable law or agreed to in writing, software
// #### distributed under the License is distributed on an "AS IS" BASIS,
// #### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// #### See the License for the specific language governing permissions and
// #### limitations under the License.
// ###################################################################
unit AVXMatrixMultOperations;
interface
{$IFDEF CPUX64}
{$DEFINE x64}
{$ENDIF}
{$IFDEF cpux86_64}
{$DEFINE x64}
{$ENDIF}
{$IFNDEF x64}
uses MatrixConst;
{$IFNDEF FPC}
// Non-FPC (Delphi) builds: link the precompiled object file instead of
// compiling the assembler implementation found further down in this unit.
// this fails -> wrong object format
// NOTE(review): presumably the .o file emitted by FPC is not in an object
// format that this Delphi version's linker accepts - confirm, and convert the
// file to an accepted format if so.
{$L '.\AVXPrecompiled\win32\AVXMatrixMultOperations.o'}
{$ENDIF}
// full matrix operations
// Multiplies the matrices mt1 and mt2 and stores the result in dest.
// Under FPC the routine is implemented in this unit; in all other compilers it
// is declared external and bound by name to the FPC-mangled symbol that is
// expected to come from the linked object file.
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
{$IFNDEF FPC} external '' name 'AVXMATRIXMULTOPERATIONS_$$_AVXMATRIXMULTALIGNED$crc2A67AB04'; {$ENDIF}
{$ENDIF}
implementation
{$IFDEF FPC} {$ASMMODE intel} {$ENDIF}
{$IFNDEF x64}
{$IFDEF FPC}
// Matrix multiplication dest := mt1 * mt2 on double precision values,
// implemented with 32-bit AVX inline assembler.
//
//   dest          - first element of the result (height1 rows, width2 columns)
//   destLineWidth - distance in bytes between two rows of dest
//   mt1, mt2      - first elements of the two input matrices
//   width1        - number of columns of mt1 (= number of mt2 rows consumed)
//   height1       - number of rows of mt1
//   width2        - number of columns of mt2
//   height2       - not referenced by this routine
//   LineWidth1/2  - distance in bytes between two rows of mt1 / mt2
//
// The routine uses aligned moves (vmovapd) on mt1, mt2 and dest, so the
// addresses touched by those moves have to satisfy the alignment requirements
// of that instruction.
// The inner loops and the row loop terminate on a counter reaching exactly
// zero (jnz), so width1 and height1 must be greater than zero.
procedure AVXMatrixMultAligned(dest : PDouble; const destLineWidth : TASMNativeInt; mt1, mt2 : PDouble; width1, height1, width2, height2 : TASMNativeInt; const LineWidth1, LineWidth2 : TASMNativeInt);
var bytesWidth2, destOffset : TASMNativeInt;
    iter : TASMNativeInt;
{$IFDEF FPC}
begin
{$ENDIF}
asm
   // prolog - save the registers that are used below
   push ebx;
   push edi;
   push esi;

   // ecx: running pointer into dest
   mov ecx, dest;

   // iter := -width1*sizeof(double);
   mov edi, width1;
   imul edi, -8;
   mov iter, edi;

   // let mt1 point behind its first row; elements are then addressed
   // with the negative offset "iter" that counts up to zero
   sub mt1, edi;

   //destOffset := destLineWidth - Width2*sizeof(double);
   mov ebx, Width2;
   shl ebx, 3;
   mov eax, destLineWidth;
   sub eax, ebx;
   mov destOffset, eax;

   //bytesWidth2 := width2*sizeof(double);
   mov bytesWidth2, ebx;

   // for y := 0 to height1 - 1 do
   @@foryloop:

      // esi -> counter to width2 (two columns are handled per iteration)
      mov esi, width2;
      sub esi, 2;
      jl @LastXColumn;

      @@forxloop:
         // for x := 0 to width2 div 2 - 1
         // edx: mt1 (end of the current row), eax: negative byte offset into that row
         // ebx: mt2 (current column pair), edi: byte stride between mt2 rows
         mov edx, mt1;
         mov ebx, mt2;
         mov eax, iter;
         mov edi, LineWidth2;

         // ymm0/ymm1 accumulate the two result columns
         vxorpd ymm0, ymm0, ymm0;
         vxorpd ymm1, ymm1, ymm1;

         // fewer than 4 elements in the row -> only the single element loop
         cmp eax, -32;
         jg @@Innerloop2Begin;

         // for z := 0 to width1 - 1do
         // AVX part:
         @@InnerLoop1:
            // 4x4 block
            vmovapd xmm2, [ebx];
            add ebx, edi;
            vmovapd xmm4, xmm2;
            vmovapd xmm3, [ebx];
            add ebx, edi;

            // shuffle so we can multiply
            // swap such that we can immediately multiply
            vmovlhps xmm2, xmm2, xmm3;
            vmovhlps xmm3, xmm3, xmm4;

            // next 4 elements
            vmovapd xmm4, [ebx];
            add ebx, edi;
            vmovapd xmm6, xmm4;
            vmovapd xmm5, [ebx];
            add ebx, edi;

            // four consecutive elements of the current mt1 row
            vmovapd ymm7, [edx + eax];

            vmovlhps xmm4, xmm4, xmm5;
            vmovhlps xmm5, xmm5, xmm6;

            vinsertf128 ymm2, ymm2, xmm4, 1;
            vinsertf128 ymm3, ymm3, xmm5, 1;

            // now multiply and add
            vmulpd ymm2, ymm2, ymm7;
            vmulpd ymm3, ymm3, ymm7;

            vaddpd ymm0, ymm0, ymm2;
            vaddpd ymm1, ymm1, ymm3;
         add eax, 32;
         jl @@InnerLoop1;

         // fold the upper 128 bit halves into the lower ones
         vextractf128 xmm2, ymm0, 1;
         vextractf128 xmm3, ymm1, 1;

         vhaddpd xmm0, xmm0, xmm2;
         vhaddpd xmm1, xmm1, xmm3;

         // nothing left in this row?
         test eax, eax;
         jz @@InnerLoopEnd2;

         @@Innerloop2Begin:

         // rest in single elements
         @@InnerLoop2:
            vmovapd xmm2, [ebx];
            add ebx, edi;

            vmovddup xmm3, [edx + eax];

            vmulpd xmm2, xmm2, xmm3;
            // the high element of the product belongs to the second column
            vmovhlps xmm4, xmm4, xmm2;

            vaddsd xmm0, xmm0, xmm2;
            vaddsd xmm1, xmm1, xmm4;
         add eax, 8;
         jnz @@InnerLoop2;

         @@InnerLoopEnd2:

         // finall horizontal addition
         vhaddpd xmm0, xmm0, xmm1;

         vmovapd [ecx], xmm0;

         // increment the pointers
         // inc(mt2), inc(dest);
         //add dword ptr [mt2], 8;
         add mt2, 16;
         add ecx, 16;

      // end for x := 0 to width2 div 2 - 1
      sub esi, 2;
      jge @@forxloop;

      @LastXColumn:

      // esi = -1 -> odd width2: one column is left; esi = -2 -> nothing left
      cmp esi, -1;
      jne @NextLine;

      // last column of mt2
      // edx and edi have to be loaded here as well: they are otherwise only
      // set inside the x loop above, which is skipped when width2 = 1
      mov edx, mt1;
      mov edi, LineWidth2;
      mov eax, iter;
      mov ebx, mt2;

      vxorpd xmm0, xmm0, xmm0;

      @LastColumnLoop:
         vmovsd xmm1, [edx + eax];
         vmovsd xmm2, [ebx];

         vmulsd xmm1, xmm1, xmm2;
         vaddsd xmm0, xmm0, xmm1;

         add ebx, edi;
      add eax, 8;
      jnz @LastColumnLoop;

      vmovsd [ecx], xmm0;
      add ecx, 8;
      add mt2, 8;

      @NextLine:
      // dec(mt2, Width2);
      // inc(PByte(mt1), LineWidth1);
      // inc(PByte(dest), destOffset);
      //mov ebx, bytesWidth2;
      //sub dword ptr [mt2], ebx;
      mov eax, bytesWidth2;
      sub mt2, eax;
      mov eax, LineWidth1;
      add mt1, eax;
      add ecx, destOffset;

   // end for y := 0 to height1 - 1
   //dec eax;
   dec height1;
   jnz @@foryloop;

   // epilog
   vzeroupper;

   pop esi;
   pop edi;
   pop ebx;
end;
{$IFDEF FPC}
end;
{$ENDIF}
{$ENDIF}
{$ENDIF}
end.
答案 0 :(得分:2)
由于这里只涉及单个函数,依我之见(IMHO),最简单的做法是直接转换FPC生成的AVXMatrixMultOperations.o文件。
可以使用出色的Object file converter工具。
您可以尝试把它从一种二进制格式转换为Delphi能够接受的另一种格式。
但我想最干净的方法是将其转换为asm:
objconv -fasm AVXMatrixMultOperations.o
它将创建一个AVXMatrixMultOperations.asm文件,
借助该文件可以把编译器不认识的AVX指令替换为简单的 db ..,..,..,.. 字节序列。
通常,生成的.asm文件左侧是汇编指令,右侧是原始的十六进制字节。
这就是我在库中处理旧Delphi编译器的方式,例如:
function crc32csse42(crc: cardinal; buf: PAnsiChar; len: cardinal): cardinal;
asm // eax=crc, edx=buf, ecx=len
not eax
test ecx, ecx
jz @0
test edx, edx
jz @0
@3: test edx, 3
jz @8 // align to 4 bytes boundary
{$ifdef ISDELPHI2010}
crc32 eax, byte ptr[edx]
{$else}
db $F2, $0F, $38, $F0, $02
{$endif}
inc edx
....
所以在你的情况下,可以写成类似下面这样:
{$ifdef FPC}
vinsertf128 ymm2, ymm2, xmm4, 1;
vinsertf128 ymm3, ymm3, xmm5, 1;
{$else}
db $xx,$yy,$zz
db $xx,$yy,$zz
{$endif}