;------------------------------------------------------------------------------
; @file
; Copyright (C) 2020, vit9696. All rights reserved.
; Copyright (C) 2006, Apple Computer, Inc. All rights reserved.
;
; All rights reserved.
;
; This program and the accompanying materials
; are licensed and made available under the terms and conditions of the BSD License
; which accompanies this distribution. The full text of the license may be found at
; http://opensource.org/licenses/bsd-license.php
;
; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
;------------------------------------------------------------------------------

BITS     64
DEFAULT  REL

;------------------------------------------------------------------------------
; The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
; Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version.
;
; To generate the binary blob execute the following command:
; nasm LegacyBcopy.nasm -o /dev/stdout | xxd -i > LegacyBcopy.h
;
; The following #defines are tightly coupled to the u-architecture:
;------------------------------------------------------------------------------

%define kShort      80              ; too short to bother with SSE (must be >=80)
%define kVeryLong   (500*1024)      ; large enough for non-temporal stores (>=8192 and <2GB)
%define kFastUCode  ((16*1024)-15)  ; cutoff for microcode fastpath for "rep movsd"

%define COMM_PAGE_LONGCOPY  7FFFFFE01200h

;------------------------------------------------------------------------------
; void bcopy(const void *src, void *dst, size_t len)
; src, dst, len ~ rdi, rsi, rdx
;------------------------------------------------------------------------------
Lbcopy:
  push    rbp                       ; set up a frame for backtraces
  mov     rbp, rsp
  mov     rax, rsi                  ; copy dest ptr
  mov     rsi, rdi                  ; exchange source and dest ptrs
  mov     rdi, rax
  sub     rax, rsi                  ; (dest - source)
  cmp     rax, rdx                  ; must move in reverse if (dest - source) < length
  jb      short LReverseIsland
  cmp     rdx, kShort               ; long enough to bother with SSE?
  jbe     short LShort              ; no
  jmp     short LNotShort

;------------------------------------------------------------------------------
; void *memcpy(void *dst, const void *src, size_t len)
; void *memmove(void *dst, const void *src, size_t len)
;
; NB: These need to be 32 bytes from bcopy().
;------------------------------------------------------------------------------
align 32
Lmemcpy:
Lmemmove:
  push    rbp                       ; set up a frame for backtraces
  mov     rbp, rsp
  mov     r11, rdi                  ; save return value here
  mov     rax, rdi
  sub     rax, rsi                  ; (dest - source)
  cmp     rax, rdx                  ; must move in reverse if (dest - source) < length
  jb      short LReverseIsland
  cmp     rdx, kShort               ; long enough to bother with SSE?
  ja      short LNotShort           ; yes

;------------------------------------------------------------------------------
; Handle short forward copies. As the most common case, this is the fall-through path.
; rdx = length (<= kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
LShort:
  mov     ecx, edx                  ; copy length using 32-bit operation
  shr     ecx, 2                    ; get #doublewords
  jz      short LLeftovers
.cycle:                             ; loop copying doublewords
  mov     eax, [rsi]
  add     rsi, 4
  mov     [rdi], eax
  add     rdi, 4
  dec     ecx
  jnz     short .cycle
LLeftovers:                         ; handle leftover bytes (0..3) in last word
  and     edx, 3
  jz      short .skip               ; any leftover bytes?
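  ; e.g. a 7-byte copy moves one doubleword above and falls through here
  ; with edx & 3 == 3 leftover bytes, copied one at a time below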
.cycle:                             ; loop copying bytes
  mov     al, [rsi]
  inc     rsi
  mov     [rdi], al
  inc     rdi
  dec     edx
  jnz     short .cycle
.skip:
  mov     rax, r11                  ; get return value (dst ptr) for memcpy/memmove
  pop     rbp
  retn

LReverseIsland:                     ; keep the "jb" above a short branch...
  jmp     LReverse                  ; ...because reverse moves are uncommon

;------------------------------------------------------------------------------
; Handle forward moves that are long enough to justify use of SSE.
; First, 16-byte align the destination.
; rdx = length (> kShort)
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
LNotShort:
  cmp     rdx, kVeryLong            ; long enough to justify heavyweight loops?
  jnb     short LVeryLong           ; use very-long-operand path
  mov     ecx, edi                  ; copy low half of destination ptr
  neg     ecx
  and     ecx, 15                   ; get #bytes to align destination
  jz      short LDestAligned        ; already aligned
  sub     edx, ecx                  ; decrement length
  rep movsb                         ; align destination

;------------------------------------------------------------------------------
; Destination is now aligned. Dispatch to the loops over 64-byte chunks,
; based on the alignment of the source. All vector stores are aligned; loads
; are aligned only when the source shares the destination's 16-byte alignment
; (LMod0), otherwise unaligned loads are used (LMod1). Since kShort>=80 and we
; have moved at most 15 bytes already, there is at least one chunk. When we
; enter the copy loops, the following registers are set up:
; rdx = residual length (0..63)
; rcx = -(length to move), a multiple of 64 less than 2GB
; rsi = ptr to 1st source byte not to move (unaligned)
; rdi = ptr to 1st dest byte not to move (aligned)
;------------------------------------------------------------------------------
LDestAligned:
  mov     rcx, rdx                  ; copy length
  mov     eax, esi                  ; copy low half of source address
  and     edx, 63                   ; get remaining bytes for LShort
  and     rcx, -64                  ; get number of bytes we will copy in inner loop
  add     rsi, rcx                  ; point to 1st byte not copied
  add     rdi, rcx
  neg     rcx                       ; now generate offset to 1st byte to be copied

  ; Choose the loop. Without SSSE3 we only have two choices:
  ; the 16-byte aligned loop (LMod0) and the unaligned loop (LMod1).
  and     eax, 15
  jz      short LMod0
  jmp     short LMod1

;------------------------------------------------------------------------------
; Very long forward moves. These are at least several pages. They are special cased
; and aggressively optimized, not so much because they are common or useful, but
; because they are subject to benchmark. There isn't enough room for them in the
; area reserved on the commpage for bcopy, so we put them elsewhere. We call
; the longcopy routine using the normal ABI:
; rdi = dest
; rsi = source
; rdx = length (>= kVeryLong bytes)
;------------------------------------------------------------------------------
LVeryLong:
  push    r11                       ; save return value
  mov     rax, COMM_PAGE_LONGCOPY
  call    rax                       ; call very long operand routine
  pop     rax                       ; pop return value
  pop     rbp
  retn

;------------------------------------------------------------------------------
; On Pentium-M, the microcode for "rep movsd" is faster than SSE for 16-byte
; aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
; about 256 bytes up to kVeryLong for cold caches. This is because the microcode
; avoids having to read destination cache lines that will be completely overwritten.
; The cutoff we use (i.e. kFastUCode) must somehow balance the two cases, since
; we do not know if the destination is in cache or not.
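; For example, with kFastUCode = 16*1024-15, a 32KB copy between 16-byte
; aligned buffers takes the rep movsd fastpath below, while a 4KB copy stays
; on the SSE loops.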
;------------------------------------------------------------------------------
Lfastpath:
  add     rsi, rcx                  ; restore ptrs to 1st byte of source and dest
  add     rdi, rcx
  neg     ecx                       ; make length positive (known to be < 2GB)
  or      ecx, edx                  ; restore total #bytes remaining to move
  cld                               ; we'll move forward
  shr     ecx, 2                    ; compute #doublewords to move
  rep movsd                         ; the u-code will optimize this
  jmp     LLeftovers                ; handle 0..3 leftover bytes

;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of rsi == 0000
;------------------------------------------------------------------------------
LMod0:
  cmp     ecx, -kFastUCode          ; rcx == -length, where (length < kVeryLong)
  jle     short Lfastpath           ; long enough for fastpath in microcode
  jmp     short .loop

align 16                            ; 16-byte align inner loops
.loop:                              ; loop over 64-byte chunks
  movdqa  xmm0, oword [rsi+rcx]
  movdqa  xmm1, oword [rsi+rcx+10h]
  movdqa  xmm2, oword [rsi+rcx+20h]
  movdqa  xmm3, oword [rsi+rcx+30h]

  movdqa  oword [rdi+rcx], xmm0
  movdqa  oword [rdi+rcx+10h], xmm1
  movdqa  oword [rdi+rcx+20h], xmm2
  movdqa  oword [rdi+rcx+30h], xmm3

  add     rcx, 64
  jnz     short .loop

  jmp     LShort                    ; copy remaining 0..63 bytes and done

;------------------------------------------------------------------------------
; Forward loop for medium length operands in which low four bits of rsi != 0000
;------------------------------------------------------------------------------
align 16
LMod1:
  movdqu  xmm0, oword [rsi+rcx]
  movdqu  xmm1, oword [rsi+rcx+10h]
  movdqu  xmm2, oword [rsi+rcx+20h]
  movdqu  xmm3, oword [rsi+rcx+30h]

  movdqa  oword [rdi+rcx], xmm0
  movdqa  oword [rdi+rcx+10h], xmm1
  movdqa  oword [rdi+rcx+20h], xmm2
  movdqa  oword [rdi+rcx+30h], xmm3

  add     rcx, 64
  jnz     short LMod1

  jmp     LShort                    ; copy remaining 0..63 bytes and done

;------------------------------------------------------------------------------
; Reverse moves. These are not optimized as aggressively as their forward
; counterparts, as they are only used with destructive overlap.
; rdx = length
; rsi = source ptr
; rdi = dest ptr
;------------------------------------------------------------------------------
LReverse:
  add     rsi, rdx                  ; point to end of buffers
  add     rdi, rdx
  cmp     rdx, kShort               ; long enough to bother with SSE?
  ja      short LReverseNotShort    ; yes

;------------------------------------------------------------------------------
; Handle reverse short copies.
; edx = length (<= kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
LReverseShort:
  mov     ecx, edx                  ; copy length
  shr     ecx, 3                    ; get #quadwords
  jz      short .l2
.l1:
  sub     rsi, 8
  mov     rax, [rsi]
  sub     rdi, 8
  mov     [rdi], rax
  dec     ecx
  jnz     short .l1
.l2:
  and     edx, 7                    ; any leftover bytes?
  jz      short .l4
.l3:
  dec     rsi
  mov     al, [rsi]
  dec     rdi
  mov     [rdi], al
  dec     edx
  jnz     short .l3
.l4:
  mov     rax, r11                  ; get return value (dst ptr) for memcpy/memmove
  pop     rbp
  retn

;------------------------------------------------------------------------------
; Handle a reverse move long enough to justify using SSE.
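; The destination is aligned first by copying bytes backward from its end, so
; the 64-byte loops below always use aligned stores.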
; rdx = length (> kShort)
; rsi = one byte past end of source
; rdi = one byte past end of dest
;------------------------------------------------------------------------------
LReverseNotShort:
  mov     ecx, edi                  ; copy destination
  and     ecx, 15                   ; get #bytes to align destination
  jz      short LReverseDestAligned ; already aligned
  sub     rdx, rcx                  ; adjust length
.cycle:                             ; loop copying 1..15 bytes
  dec     rsi
  mov     al, [rsi]
  dec     rdi
  mov     [rdi], al
  dec     ecx
  jnz     short .cycle

;------------------------------------------------------------------------------
; Destination is now aligned. Prepare for reverse loops.
;------------------------------------------------------------------------------
LReverseDestAligned:
  mov     rcx, rdx                  ; copy length
  and     edx, 63                   ; get remaining bytes for LReverseShort
  and     rcx, -64                  ; get number of bytes we will copy in inner loop
  sub     rsi, rcx                  ; point to endpoint of copy
  sub     rdi, rcx
  test    esi, 15                   ; is source aligned too?
  jnz     short LReverseUnalignedLoop

;------------------------------------------------------------------------------
; Reverse loop over 64-byte aligned chunks.
;------------------------------------------------------------------------------
LReverseAlignedLoop:
  movdqa  xmm0, oword [rsi+rcx-16]
  movdqa  xmm1, oword [rsi+rcx-32]
  movdqa  xmm2, oword [rsi+rcx-48]
  movdqa  xmm3, oword [rsi+rcx-64]

  movdqa  oword [rdi+rcx-16], xmm0
  movdqa  oword [rdi+rcx-32], xmm1
  movdqa  oword [rdi+rcx-48], xmm2
  movdqa  oword [rdi+rcx-64], xmm3

  sub     rcx, 64
  jnz     short LReverseAlignedLoop

  jmp     LReverseShort             ; copy remaining 0..63 bytes and done

;------------------------------------------------------------------------------
; Reverse, unaligned loop. LDDQU == MOVDQU on these machines.
;------------------------------------------------------------------------------
LReverseUnalignedLoop:
  movdqu  xmm0, oword [rsi+rcx-16]
  movdqu  xmm1, oword [rsi+rcx-32]
  movdqu  xmm2, oword [rsi+rcx-48]
  movdqu  xmm3, oword [rsi+rcx-64]

  movdqa  oword [rdi+rcx-16], xmm0
  movdqa  oword [rdi+rcx-32], xmm1
  movdqa  oword [rdi+rcx-48], xmm2
  movdqa  oword [rdi+rcx-64], xmm3

  sub     rcx, 64
  jnz     short LReverseUnalignedLoop

  jmp     LReverseShort             ; copy remaining 0..63 bytes and done
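
;------------------------------------------------------------------------------
; Layout note: in the assembled blob, bcopy() is at offset 0 and
; memcpy()/memmove() at offset 32 (enforced by the "align 32" above), so a
; consumer of the generated LegacyBcopy.h presumably resolves both entry points
; from those fixed offsets. A minimal caller sketch, kept as a comment so the
; generated blob is unchanged (BlobBase, Dst, Src and Length are hypothetical):
;   mov     rdi, Dst
;   mov     rsi, Src
;   mov     rdx, Length
;   mov     rax, BlobBase
;   add     rax, 32                 ; memcpy()/memmove() entry point
;   call    rax
;------------------------------------------------------------------------------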