; Copy the first space-delimited word of CMD_BLOCK (at most 16 bytes) into CMD,
; then load the 16-byte CMD buffer into xmm0.
;   In:    CMD_BLOCK = source bytes, CMD = 16-byte destination buffer
;   Out:   xmm0 = contents of CMD; rsi = CMD
;   Clobb: al, rcx, rdi, rsi, flags
; NOTE(review): movdqa faults unless CMD is 16-byte aligned, and bytes of CMD
; past the copied word are whatever was there before -- presumably the caller
; zero-fills CMD first; confirm.
        mov     rcx, 16                 ; copy at most 16 bytes
        mov     rdi, CMD                ; rdi = destination cursor
        mov     rsi, CMD_BLOCK          ; rsi = source cursor
@@:     lodsb                           ; al = *rsi++
        cmp     al, 0x20                ; a space terminates the word
        je      @f
        stosb                           ; *rdi++ = al (this store was missing: nothing ever reached CMD)
        loop    @b                      ; rcx counts the remaining bytes
@@:     mov     rsi, CMD
        movdqa  xmm0, [rsi]             ; xmm0 = the 16-byte CMD buffer
答案 0 :(得分:3)
这甚至不是编写逐字节复制循环的最高效方式。除非你专门针对少数几种 LOOP 指令并不慢的微架构(例如 AMD Bulldozer)进行调优,否则永远不要使用 LOOP 指令。请参阅 Agner Fog 的优化资料,以及 x86 标签 wiki 中的其他链接。或者通过 C intrinsics 使用 SSE/AVX,让编译器去生成实际的汇编代码。
我假设你在开始向 16 字节的 CMD 缓冲区复制之前已经将其清零;否则,你完全可以直接做一次未对齐加载,所需数据之后是什么垃圾字节就一并取到什么。
section .rodata
ALIGN 32                                ; no cache-line splits when taking an unaligned 16B window on these 32 bytes
        dd      -1, -1, -1, -1          ; 16 bytes of 0xFF, ending right where zeroing_mask starts
zeroing_mask:                           ; [zeroing_mask - n] = n bytes of 0xFF followed by (16 - n) bytes of 0x00
        dd      0, 0, 0, 0
ALIGN 16                                ; end_pattern is read with movdqa, which needs 16-byte alignment
end_pattern:
        times 16 db 0x20                ; pre-broadcast the byte to compare against (or generate it on the fly)

section .text
        ; ... as part of some function ...
        ; Out:   xmm0 = bytes of [CMD_BLOCK] before the first 0x20, remaining bytes zeroed
        ;               (or all 16 bytes unchanged when there is no 0x20)
        ; Clobb: rax, xmm1, flags
        movdqu  xmm0, [CMD_BLOCK]       ; you don't have to waste instructions putting pointers in registers
        movdqa  xmm1, [end_pattern]     ; or hoist this load out of a loop
        pcmpeqb xmm1, xmm0              ; 0xFF in every byte that equals 0x20

        pmovmskb eax, xmm1              ; bit i of eax set <=> byte i matched
        bsf     eax, eax                ; index of the FIRST match = number of bytes to keep
                                        ; (bsf, not bsr: bsr would return the LAST match)
        jz      @no_match               ; ZF=1 when the mask was 0; the destination is not usable then
        neg     rax                     ; go back this far into the all-ones bytes
        ; NOTE: absolute address + index is not position-independent; in PIC/PIE
        ; code, lea the table base into a register first.
        movdqu  xmm1, [zeroing_mask + rax] ; take a window of 16 bytes: rax 0xFF bytes, then zeros
        pand    xmm0, xmm1
@no_match:                              ; all bytes are valid, no masking needed
        ;; XMM0 holds bytes from [CMD_BLOCK], up to but not including the first 0x20.
在 Intel Haswell 上,从 PCMPEQB 的输入就绪到 PAND 的输出就绪,这段代码的延迟应该约为 11 个周期。
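如果可以使用 TZCNT 来代替位扫描指令(BSF/BSR),就可以避免使用分支。这里需要的是 TZCNT 而不是 LZCNT:第一个匹配字节对应掩码中最低的置位,而 TZCNT 统计的正是最低置位以下的零的个数,也就是第一个匹配的下标。因为我们希望在无匹配的情况下得到 16(这样 neg eax 得到 -16,加载到的就是一个全 1 的向量),16 位的 TZCNT 正好能做到这一点。(tzcnt ax, ax 即可,因为 PMOVMSKB 已经把 RAX 的高位清零;否则可以使用 xor ecx, ecx / tzcnt cx, ax。)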
这种生成掩码的思路——用一次未对齐加载,在“一段全 1 加一段全 0”的数据上取一个窗口——与我在 Vectorizing with unaligned buffers: using VMASKMOVPS: generating a mask from a misalignment count? Or not using that insn at all 这个问题下的一个回答相同。下面的版本则不从内存加载掩码,而是直接在寄存器中生成它:
;; Load-free variant: build the mask in registers by smearing every match byte
;; toward the high end of the vector, doubling the smear width each step
;; (1 -> 2 -> 4 -> 8 -> 16 bytes).
;;   Out:   xmm1 = bytes of [CMD_BLOCK] below the first 0x20, all higher bytes zeroed
;;   Clobb: xmm0 (left holding the raw 16 input bytes), xmm2
        movdqu  xmm0, [CMD_BLOCK]
        movdqa  xmm1, [end_pattern]
        pcmpeqb xmm1, xmm0              ; 0xFF wherever a byte equals 0x20; 0 below the first match, ?? above it

        movdqa  xmm2, xmm1
        pslldq  xmm2, 1                 ; byte shift toward higher indices
        por     xmm1, xmm2              ; every match is now 2 bytes wide

        movdqa  xmm2, xmm1
        pslldq  xmm2, 2
        por     xmm1, xmm2              ; ... 4 bytes wide

        ;; From here on a dword shuffle replaces copy + byte shift. Dwords listed
        ;; low to high; the duplicated low dword is harmless because the results
        ;; are ORed together and that dword is already part of the mask:
        ;;   [ abcd efgh ijkl mnop ] -> [ abcd abcd efgh ijkl ]
        pshufd  xmm2, xmm1, 0x90        ; 0b10_01_00_00: [ a b c d ] -> [ a a b c ]
        por     xmm1, xmm2              ; ... 8 bytes wide

        pshufd  xmm2, xmm1, 0x40        ; 0b01_00_00_00: [ a b c d ] -> [ a a a b ]
        por     xmm1, xmm2              ; ... 16 bytes wide: 0xFF from the first match to the top, no ?? left

        ;; xmm1 is now 0x00 for bytes to keep and 0xFF for bytes to drop (the
        ;; complement of a keep-mask), hence PANDN rather than PAND.
        pandn   xmm1, xmm0              ; xmm1 = ~xmm1 & xmm0 = [CMD_BLOCK] with upper bytes zeroed
如果把数据与终止符做异或,使 0x20 字节变成 0x00 字节,就有可能使用 SSE4.2 的字符串指令,因为它们本来就是为处理隐式长度字符串而设计的:第一个 0x00 之后的所有字节都被视为无效。请参阅 this tutorial/example(这篇教程/示例),因为英特尔的文档只是把所有细节逐一罗列,而没有先突出重点。
;; SSE4.2 variant. PCMPISTRM control byte 0x78 = 0b0_1_11_10_00:
;;   imm8[1:0] = 00  unsigned bytes
;;   imm8[3:2] = 10  "equal each" (vertical compare); never equal here, because
;;                   the second operand is the first one XORed with 0x20
;;   imm8[5:4] = 11  masked negative polarity: the compare result is inverted for
;;                   valid bytes of the second operand only
;;   imm8[6]   = 1   output a byte mask, not a bit mask
;;   imm8[7]   = 0   reserved
;; NOTE(review): per the SDM definition this yields 0xFF for bytes before the
;; first 0x20 and 0x00 from there on, so PAND (not PANDN) is the matching final
;; op -- worth confirming with a quick test.
;;   Out:   xmm0 = bytes of [CMD_BLOCK] before the first 0x20, remaining bytes zeroed
;;   Clobb: xmm1, xmm2, flags
        movdqu  xmm1, [CMD_BLOCK]
        movdqa  xmm2, xmm1
        pxor    xmm2, [end_pattern]     ; stop character becomes 0x00, so xmm2 looks like an
                                        ; implicit-length string ending at the first 0x20.
                                        ; Every byte of xmm2 now differs from xmm1, which fixes
                                        ; the compare result for the valid part (unless the
                                        ; input itself can contain 0x00 bytes).
                                        ; The legacy-SSE memory operand must be 16-byte aligned.
        pcmpistrm xmm1, xmm2, 0x78      ; implicit destination operand: XMM0
        pand    xmm0, xmm1              ; keep only the bytes the mask marks