
;------------------------------------------------------------------------------
; @file
; Copyright (C) 2020, vit9696. All rights reserved.
; Copyright (C) 2006, Apple Computer, Inc. All rights reserved.
;
; All rights reserved.
;
; This program and the accompanying materials
; are licensed and made available under the terms and conditions of the BSD License
; which accompanies this distribution. The full text of the license may be found at
; http://opensource.org/licenses/bsd-license.php
;
; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
;------------------------------------------------------------------------------
BITS 64
DEFAULT REL
;------------------------------------------------------------------------------
; The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
; Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
;
; To generate the binary blob execute the following command:
; nasm LegacyBcopy.nasm -o /dev/stdout | xxd -i > LegacyBcopy.h
;
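; The xxd -i output is a comma-separated C initializer list, suitable for filling
; an unsigned char array in LegacyBcopy.h; the file carries no sections or
; relocations. Consumers of the blob are expected to find bcopy() at offset 0 and
; memcpy()/memmove() 32 bytes in (see the NB and "align 32" below).
;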
; The following #defines are tightly coupled to the u-architecture:
;------------------------------------------------------------------------------
%define kShort 80 ; too short to bother with SSE (must be >=80)
%define kVeryLong (500*1024) ; large enough for non-temporal stores (>=8192 and <2GB)
%define kFastUCode ((16*1024)-15) ; cutoff for microcode fastpath for "rep/movsl"
%define COMM_PAGE_LONGCOPY 7FFFFFE01200h
;------------------------------------------------------------------------------
; void bcopy(const void *src, void *dst, size_t len)
; src, dst, len ~ rdi, rsi, rdx
;------------------------------------------------------------------------------
Lbcopy:
push rbp ; set up a frame for backtraces
mov rbp, rsp
mov rax, rsi ; copy dest ptr
mov rsi, rdi ; exchange source and dest ptrs
mov rdi, rax
sub rax, rsi ; (dest - source)
cmp rax, rdx ; must move in reverse if (dest - source) < length
jb short LReverseIsland
cmp rdx, kShort ; long enough to bother with SSE?
jbe short LShort ; no
jmp short LNotShort
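; Worked example of the overlap test above (illustrative values): with
; dst = 1008h, src = 1000h, len = 20h, (dest - source) = 8 is below the length,
; so a forward copy would clobber unread source bytes and we take LReverseIsland.
; With dst = 1000h, src = 1008h the subtraction wraps to 0FFFFFFFFFFFFFFF8h,
; which is unsigned-above the length, so the forward path is safe.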
;------------------------------------------------------------------------------
; void *memcpy(void *dst, const void *src, size_t len)
; void *memmove(void *dst, const void *src, size_t len)
;
; NB: These entry points must start exactly 32 bytes after bcopy().
;------------------------------------------------------------------------------
align 32
Lmemcpy:
Lmemmove:
push rbp ; set up a frame for backtraces
mov rbp, rsp
mov r11, rdi ; save return value here
mov rax, rdi
sub rax, rsi ; (dest - source)
cmp rax, rdx ; must move in reverse if (dest - source) < length
jb short LReverseIsland
cmp rdx, kShort ; long enough to bother with SSE?
ja short LNotShort ; yes
;------------------------------------------------------------------------------
; Handle short forward copies. As the most common case, this is the fall-through path.
; rdx = length (<= kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
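; Example (illustrative): rdx = 13 gives ecx = 13 >> 2 = 3 doubleword copies,
; then edx & 3 = 1 leftover byte handled at LLeftovers.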
LShort:
mov ecx, edx ; copy length using 32-bit operation
shr ecx, 2 ; get #doublewords
jz short LLeftovers
.cycle: ; loop copying doublewords
mov eax, [rsi]
add rsi, 4
mov [rdi], eax
add rdi, 4
dec ecx
jnz short .cycle
LLeftovers: ; handle leftover bytes (0..3) in last word
and edx, 3
jz short .skip ; any leftover bytes?
.cycle: ; loop copying bytes
mov al, [rsi]
inc rsi
mov [rdi], al
inc rdi
dec edx
jnz short .cycle
.skip:
mov rax, r11 ; get return value (dst ptr) for memcpy/memmove
pop rbp
retn
LReverseIsland: ; keep the "jb" above a short branch...
jmp LReverse ; ...because reverse moves are uncommon
;------------------------------------------------------------------------------
; Handle forward moves that are long enough to justify use of SSE.
; First, 16-byte align the destination.
; rdx = length (> kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
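; Example of the alignment math below (illustrative): if the destination ends in
; ...9h, then neg/and yields ecx = (16 - 9) & 15 = 7, so "rep movsb" copies 7 bytes
; and leaves rdi on a 16-byte boundary; a destination already ending in ...0h gives
; ecx = 0 and skips straight to LDestAligned.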
LNotShort:
cmp rdx, kVeryLong ; long enough to justify heavyweight loops?
jnb short LVeryLong ; use very-long-operand path
mov ecx, edi ; copy low half of destination ptr
neg ecx
and ecx, 15 ; get #bytes to align destination
jz short LDestAligned ; already aligned
sub edx, ecx ; decrement length
rep movsb ; align destination
;------------------------------------------------------------------------------
; Destination is now aligned. Dispatch to the loops over 64-byte chunks,
; based on the alignment of the source. All vector stores are aligned; loads use
; movdqa when the source is also 16-byte aligned (LMod0) and movdqu otherwise (LMod1),
; since this version has no SSSE3 shift-and-repack loops. Since kShort>=80 and we
; have moved at most 15 bytes already,
; there is at least one chunk. When we enter the copy loops, the following registers
; are set up:
; rdx = residual length (0..63)
; rcx = -(length to move), a multiple of 64 less than 2GB
; rsi = ptr to 1st source byte not to move (unaligned)
; rdi = ptr to 1st dest byte not to move (aligned)
;------------------------------------------------------------------------------
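; Example of the setup below (illustrative): with 200 bytes left, edx = 200 & 63 = 8
; residual bytes and rcx = 200 & -64 = 192. Both pointers are advanced by 192 and
; rcx is negated to -192, so [rsi+rcx]/[rdi+rcx] start at the original addresses
; and the inner loops run rcx up toward zero in 64-byte steps.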
LDestAligned:
mov rcx, rdx ; copy length
mov eax, esi ; copy low half of source address
and edx, 63 ; get remaining bytes for LShort
and rcx, -64 ; get number of bytes we will copy in inner loop
add rsi, rcx ; point to 1st byte not copied
add rdi, rcx
neg rcx ; now generate offset to 1st byte to be copied
; Choose the loop. Without SSSE3 we only have two choices:
; the 16-byte-aligned source loop (LMod0) and the unaligned source loop (LMod1).
and eax, 15
jz short LMod0
jmp short LMod1
;------------------------------------------------------------------------------
; Very long forward moves. These are at least several pages. They are special cased
; and aggressively optimized, not so much because they are common or useful, but
; because they show up in benchmarks. There isn't enough room for them in the
; area reserved on the commpage for bcopy, so we put them elsewhere. We call
; the longcopy routine using the normal ABI:
; rdi = dest
; rsi = source
; rdx = length (>= kVeryLong bytes)
;------------------------------------------------------------------------------
LVeryLong:
push r11 ; save return value
mov rax, COMM_PAGE_LONGCOPY
call rax ; call very long operand routine
pop rax ; recover return value (dst ptr) for memcpy/memmove
pop rbp
retn
;------------------------------------------------------------------------------
; On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
; aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
; about 256 bytes up to kVeryLong for cold caches. This is because the microcode
; avoids having to read destination cache lines that will be completely overwritten.
; The cutoff we use (i.e., kFastUCode) must balance the two cases, since we do not
; know whether the destination is in cache or not.
;------------------------------------------------------------------------------
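; In Lfastpath below, rcx is still the negative loop offset and edx holds the 0..63
; residual, so adding rcx back rewinds both pointers, "neg ecx" recovers the chunked
; byte count, and "or ecx, edx" acts as addition because the low six bits of ecx are
; zero. Example (values illustrative): ecx = 192, edx = 8 gives 192 | 8 = 200, the
; full remaining length for "rep movsd".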
Lfastpath:
add rsi, rcx ; restore ptrs to 1st byte of source and dest
add rdi, rcx
neg ecx ; make length positive (known to be < 2GB)
or ecx, edx ; restore total #bytes remaining to move
cld ; we'll move forward
shr ecx, 2 ; compute #doublewords to move
rep movsd ; the u-code will optimize this
jmp LLeftovers ; handle 0..3 leftover bytes
;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of rsi == 0000
;------------------------------------------------------------------------------
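; Note on the compare below: ecx holds the negated, 64-byte-rounded remaining length,
; and kFastUCode == (16*1024)-15 == 16369, so "jle" takes the microcode fastpath when
; ecx <= -16369, i.e. when roughly 16 KiB or more remain.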
LMod0:
cmp ecx, -kFastUCode ; rcx == -length, where (length < kVeryLong)
jle short Lfastpath ; long enough for fastpath in microcode
jmp short .loop
align 16 ; 16-byte align inner loops
.loop: ; loop over 64-byte chunks
movdqa xmm0, oword [rsi+rcx]
movdqa xmm1, oword [rsi+rcx+10h]
movdqa xmm2, oword [rsi+rcx+20h]
movdqa xmm3, oword [rsi+rcx+30h]
movdqa oword [rdi+rcx], xmm0
movdqa oword [rdi+rcx+10h], xmm1
movdqa oword [rdi+rcx+20h], xmm2
movdqa oword [rdi+rcx+30h], xmm3
add rcx, 64
jnz short .loop
jmp LShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of rsi != 0000
;------------------------------------------------------------------------------
align 16
LMod1:
movdqu xmm0, oword [rsi+rcx]
movdqu xmm1, oword [rsi+rcx+10h]
movdqu xmm2, oword [rsi+rcx+20h]
movdqu xmm3, oword [rsi+rcx+30h]
movdqa oword [rdi+rcx], xmm0
movdqa oword [rdi+rcx+10h], xmm1
movdqa oword [rdi+rcx+20h], xmm2
movdqa oword [rdi+rcx+30h], xmm3
add rcx, 64
jnz short LMod1
jmp LShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Reverse moves. These are not optimized as aggressively as their forward
; counterparts, as they are only used with destructive overlap.
; rdx = length
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
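; Example of the setup below (illustrative): src = 1000h, dst = 1008h, len = 100h
; leaves rsi = 1100h and rdi = 1108h, one byte past each buffer, and the copy then
; proceeds downward so overlapping bytes are read before they are overwritten.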
LReverse:
add rsi, rdx ; point to end of strings
add rdi, rdx
cmp rdx, kShort ; long enough to bother with SSE?
ja short LReverseNotShort ; yes
;------------------------------------------------------------------------------
; Handle reverse short copies.
; edx = length (<= kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
LReverseShort:
mov ecx, edx ; copy length
shr ecx, 3 ; #quadwords
jz short .l2
.l1:
sub rsi, 8
mov rax, [rsi]
sub rdi, 8
mov [rdi], rax
dec ecx
jnz short .l1
.l2:
and edx, 7 ; any leftover bytes (0..7)?
jz short .l4
.l3:
dec rsi
mov al, [rsi]
dec rdi
mov [rdi], al
dec edx
jnz short .l3
.l4:
mov rax, r11 ; get return value (dst ptr) for memcpy/memmove
pop rbp
retn
;------------------------------------------------------------------------------
; Handle a reverse move long enough to justify using SSE.
; rdx = length (> kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
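; Example of the alignment step below (illustrative): if rdi (one past the end of
; the destination) ends in ...0Bh, then ecx = 0Bh = 11, so 11 bytes are copied
; downward one at a time, leaving rdi on a 16-byte boundary before the vector loops.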
LReverseNotShort:
mov ecx, edi ; copy low half of destination ptr
and ecx, 15 ; get #bytes to align destination
jz short LReverseDestAligned ; already aligned
sub rdx, rcx ; adjust length
.cycle: ; loop copying 1..15 bytes
dec rsi
mov al, [rsi]
dec rdi
mov [rdi], al
dec ecx
jnz short .cycle
;------------------------------------------------------------------------------
; Destination is now aligned. Prepare for reverse loops.
;------------------------------------------------------------------------------
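; Mirror of the forward setup, but rcx stays positive and counts down. Example
; (illustrative): with 200 bytes left, edx = 8 and rcx = 192; both pointers are
; backed off by 192, the loops index [ptr+rcx-16] downward from the top, and when
; rcx reaches zero the remaining 8 bytes below are finished by LReverseShort.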
LReverseDestAligned:
mov rcx, rdx ; copy length
and edx, 63 ; get remaining bytes for LReverseShort
and rcx, -64 ; get number of bytes we will copy in inner loop
sub rsi, rcx ; point to endpoint of copy
sub rdi, rcx
test esi, 15 ; is source aligned too?
jnz short LReverseUnalignedLoop
;------------------------------------------------------------------------------
; Reverse loop over 64-byte aligned chunks.
;------------------------------------------------------------------------------
LReverseAlignedLoop:
movdqa xmm0, oword [rsi+rcx-16]
movdqa xmm1, oword [rsi+rcx-32]
movdqa xmm2, oword [rsi+rcx-48]
movdqa xmm3, oword [rsi+rcx-64]
movdqa oword [rdi+rcx-16], xmm0
movdqa oword [rdi+rcx-32], xmm1
movdqa oword [rdi+rcx-48], xmm2
movdqa oword [rdi+rcx-64], xmm3
sub rcx, 64
jnz short LReverseAlignedLoop
jmp LReverseShort ; copy remaining 0..63 bytes and done
;------------------------------------------------------------------------------
; Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
;------------------------------------------------------------------------------
LReverseUnalignedLoop:
movdqu xmm0, oword [rsi+rcx-16]
movdqu xmm1, oword [rsi+rcx-32]
movdqu xmm2, oword [rsi+rcx-48]
movdqu xmm3, oword [rsi+rcx-64]
movdqa oword [rdi+rcx-16], xmm0
movdqa oword [rdi+rcx-32], xmm1
movdqa oword [rdi+rcx-48], xmm2
movdqa oword [rdi+rcx-64], xmm3
sub rcx, 64
jnz short LReverseUnalignedLoop
jmp LReverseShort ; copy remaining 0..63 bytes and done