我一直在尝试把一些内联汇编函数移植到独立的汇编文件中。除了一个函数之外,其余都移植成功了;下面先给出它的内联版本,然后是我目前的独立版本。独立版本一旦用到 onesByte,就无法返回与内联版本相同的结果。我在这里提问,是希望有人了解汇编器处理独立 asm 文件的方式与编译器处理内联汇编的方式有何不同。
内联版本:
// Sixteen bytes of 0x01 on a 16-byte boundary. checkOscillation5_SSE2 uses it
// as the 128-bit memory operand of psubusb to subtract 1 (with unsigned
// saturation) from every byte lane.
__declspec(align(16)) const __int64 onesByte[2] = { 0x0101010101010101, 0x0101010101010101 };
// Builds a per-byte mask in dstp from five input rows, 16 bytes at a time.
//
// For each byte position let
//   A = { p2p, s1p, n2p }   and   B = { p1p, n1p },
//   t = low byte of (thresh - 1).
// The output byte is 0xFF when all of the following hold, otherwise 0x00:
//   max(B) - min(B) <= t
//   max(A) - min(A) <= t
//   max(A) <= min(B) -sat 1   OR   max(B) <= min(A) -sat 1
// ("-sat" is unsigned saturating subtraction, as performed by psubusb.)
//
// Parameters:
//   p2p, p1p, s1p, n1p, n2p  input row pointers
//   dstp                     output row pointer
//   stride                   bytes added to every pointer after each row
//   width                    bytes processed per row, in steps of 16
//   height                   number of rows
//   thresh                   threshold; decremented once before use
//
// All loads and the store use aligned 128-bit accesses (movdqa or SSE memory
// operands), so every row address must be 16-byte aligned.
// Both loops test their condition at the bottom, so at least one 16-byte
// group of one row is always processed.
void TComb::checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p, const uint8_t *s1p,
const uint8_t *n1p, const uint8_t *n2p, uint8_t *dstp, int stride, int width, int height, int thresh)
{
__asm
{
// Register roles: eax = p2p (temporarily dstp for the store), ebx = p1p,
// edx = s1p, edi = n1p, esi = n2p, ecx = byte offset within the row.
mov eax, p2p
mov ebx, p1p
mov edx, s1p
mov edi, n1p
mov esi, n2p
// xmm6 = all zero, used as the comparison value for pcmpeqb.
pxor xmm6, xmm6
// xmm7 = low byte of (thresh - 1) replicated into all 16 byte lanes.
dec thresh
movd xmm7, thresh
punpcklbw xmm7, xmm7
punpcklwd xmm7, xmm7
punpckldq xmm7, xmm7
punpcklqdq xmm7, xmm7
yloop :
// Start of a row: reset the column offset.
xor ecx, ecx
align 16
xloop :
// xmm0 = min(A), xmm1 = max(A), xmm2 = min(B), xmm3 = max(B).
movdqa xmm0, [eax + ecx]
movdqa xmm2, [ebx + ecx]
movdqa xmm1, xmm0
movdqa xmm3, xmm2
pminub xmm0, [edx + ecx]
pmaxub xmm1, [edx + ecx]
pminub xmm2, [edi + ecx]
pmaxub xmm3, [edi + ecx]
pminub xmm0, [esi + ecx]
pmaxub xmm1, [esi + ecx]
// xmm4 = (max(B) - min(B)) -sat t, xmm5 = (max(A) - min(A)) -sat t;
// a lane is zero exactly when the range is <= t.
movdqa xmm4, xmm3
movdqa xmm5, xmm1
psubusb xmm4, xmm2
psubusb xmm5, xmm0
psubusb xmm4, xmm7
psubusb xmm5, xmm7
// Lower both minima by 1 (saturating at 0).
psubusb xmm2, onesByte
psubusb xmm0, onesByte
// xmm1 lane is zero when max(A) <= min(B) -sat 1;
// xmm3 lane is zero when max(B) <= min(A) -sat 1.
psubusb xmm1, xmm2
psubusb xmm3, xmm0
// Turn "lane == 0" into 0xFF / 0x00 masks.
pcmpeqb xmm1, xmm6
pcmpeqb xmm3, xmm6
pcmpeqb xmm4, xmm6
pcmpeqb xmm5, xmm6
// eax is borrowed for the destination pointer; it is reloaded with p2p below.
mov eax, dstp
// result = (xmm1 | xmm3) & xmm4 & xmm5
por xmm1, xmm3
pand xmm4, xmm5
pand xmm1, xmm4
movdqa[eax + ecx], xmm1
add ecx, 16
mov eax, p2p
// Signed compare of the offset against width.
cmp ecx, width
jl xloop
// Advance every row pointer by stride. p2p and dstp are updated in their
// parameter variables because eax does not hold them across the inner loop.
mov eax, stride
add ebx, stride
add p2p, eax
add edx, stride
add edi, stride
add dstp, eax
add esi, stride
mov eax, p2p
dec height
jnz yloop
}
}
独立版:
; Enable SSE/SSE2 instruction mnemonics and XMM register names.
.xmm
; 32-bit flat memory model with C naming and calling conventions.
.model flat,c
; Initialised data follows.
.data
; Place the constant on a 16-byte boundary so it can be used directly as a
; 128-bit SSE memory operand.
align 16
; Two qwords = sixteen bytes of 01h; subtracted from every byte lane with
; psubusb in checkOscillation5_SSE2.
onesByte qword 2 dup(0101010101010101h)
;-----------------------------------------------------------------------
; void checkOscillation5_SSE2(const uint8_t *p2p, const uint8_t *p1p,
;                             const uint8_t *s1p, const uint8_t *n1p,
;                             const uint8_t *n2p, uint8_t *dstp,
;                             int stride, int width, int height, int thresh)
;
; Syntax: MASM (ml.exe), 32-bit, .model flat,c (arguments on the stack).
;
; Builds a per-byte mask in dstp, 16 bytes at a time.  For each byte let
;   A = { p2p, s1p, n2p },  B = { p1p, n1p },  t = low byte of (thresh - 1).
; The output byte is 0FFh when
;   max(B) - min(B) <= t   and   max(A) - min(A) <= t   and
;   ( max(A) <= min(B) -sat 1   or   max(B) <= min(A) -sat 1 )
; and 00h otherwise ("-sat" = unsigned saturating subtract, psubusb).
;
; In:    p2p,p1p,s1p,n1p,n2p = input rows, dstp = output row,
;        stride = bytes between rows, width_ = bytes per row (step 16),
;        height = rows, thresh = threshold (decremented once before use).
;        Every row address must be 16-byte aligned (aligned SSE accesses).
; Out:   nothing meaningful in eax.
; Saved: ebx, esi, edi (via USES).
; Clobb: eax, ecx, edx, xmm0-xmm7, flags; the p2p, dstp, height and
;        thresh argument slots are modified in place.
; Note:  both loops test at the bottom, so at least one 16-byte group of
;        one row is always processed.
;-----------------------------------------------------------------------

; Open the code segment.  The file's last segment directive before this
; point is .data, so without .code the whole procedure would be assembled
; into the data segment, and the ALIGN inside the loop would be padded
; according to data-segment rules instead of with NOPs.
.code

checkOscillation5_SSE2 proc uses ebx esi edi p2p:dword,p1p:dword,s1p:dword,n1p:dword,n2p:dword,dstp:dword,stride:dword,width_:dword,height:dword,thresh:dword
public checkOscillation5_SSE2

        ; Register roles: eax = p2p (borrowed for dstp at the store),
        ; ebx = p1p, edx = s1p, edi = n1p, esi = n2p, ecx = row offset.
        mov     eax, p2p
        mov     ebx, p1p
        mov     edx, s1p
        mov     edi, n1p
        mov     esi, n2p

        pxor    xmm6, xmm6              ; xmm6 = 0, compare value for pcmpeqb

        ; xmm7 = low byte of (thresh - 1) replicated into all 16 lanes
        dec     thresh
        movd    xmm7, thresh
        punpcklbw xmm7, xmm7
        punpcklwd xmm7, xmm7
        punpckldq xmm7, xmm7
        punpcklqdq xmm7, xmm7

yloop:
        xor     ecx, ecx                ; column offset = 0
        align   16                      ; NOP padding; falls through once per row
xloop:
        ; xmm0 = min(A), xmm1 = max(A), xmm2 = min(B), xmm3 = max(B)
        movdqa  xmm0, [eax+ecx]
        movdqa  xmm2, [ebx+ecx]
        movdqa  xmm1, xmm0
        movdqa  xmm3, xmm2
        pminub  xmm0, [edx+ecx]
        pmaxub  xmm1, [edx+ecx]
        pminub  xmm2, [edi+ecx]
        pmaxub  xmm3, [edi+ecx]
        pminub  xmm0, [esi+ecx]
        pmaxub  xmm1, [esi+ecx]

        ; xmm4 = range(B) -sat t, xmm5 = range(A) -sat t (zero => range <= t)
        movdqa  xmm4, xmm3
        movdqa  xmm5, xmm1
        psubusb xmm4, xmm2
        psubusb xmm5, xmm0
        psubusb xmm4, xmm7
        psubusb xmm5, xmm7

        ; lower both minima by 1, saturating at 0
        psubusb xmm2, oword ptr onesByte
        psubusb xmm0, oword ptr onesByte

        psubusb xmm1, xmm2              ; zero => max(A) <= min(B) -sat 1
        psubusb xmm3, xmm0              ; zero => max(B) <= min(A) -sat 1

        ; convert "lane == 0" into 0FFh / 00h masks
        pcmpeqb xmm1, xmm6
        pcmpeqb xmm3, xmm6
        pcmpeqb xmm4, xmm6
        pcmpeqb xmm5, xmm6

        mov     eax, dstp               ; eax borrowed for the store
        por     xmm1, xmm3
        pand    xmm4, xmm5
        pand    xmm1, xmm4              ; (m1 | m3) & m4 & m5
        movdqa  [eax+ecx], xmm1

        add     ecx, 16
        mov     eax, p2p                ; restore eax = p2p for the next group
        cmp     ecx, width_
        jl      xloop                   ; signed: width_ is a C int

        ; advance every row pointer by stride; p2p and dstp live in their
        ; argument slots because eax is shared between them
        mov     eax, stride
        add     ebx, stride
        add     p2p, eax
        add     edx, stride
        add     edi, stride
        add     dstp, eax
        add     esi, stride
        mov     eax, p2p
        dec     height
        jnz     yloop

        ret
checkOscillation5_SSE2 endp
非常感谢任何有关此事的帮助或见解。