
; @file
; Copyright (C) 2013 Intel Corporation. All rights reserved.
; Copyright (C) 2021, ISP RAS. All rights reserved.
;
; This program and the accompanying materials
; are licensed and made available under the terms and conditions of the BSD License
; which accompanies this distribution. The full text of the license may be found at
; http://opensource.org/licenses/bsd-license.php
;
; THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,
; WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.
;
; #######################################################################
;
; This code is described in an Intel White-Paper:
; "Fast SHA-512 Implementations on Intel Architecture Processors"
;
; ########################################################################
; ### Binary Data
BITS 64
extern ASM_PFX(SHA512_K)
extern ASM_PFX(mIsAccelEnabled)
section .rodata
align 16
; Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
XMM_QWORD_BSWAP:
dq 0x0001020304050607,0x08090a0b0c0d0e0f
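; Hedged example of the mask's effect: vpshufb with XMM_QWORD_BSWAP reverses
; the bytes within each qword lane independently, so a lane holding
; 0x0123456789abcdef becomes 0xefcdab8967452301. This converts the big-endian
; message words into the native qword order the arithmetic below expects.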
; ########################################################################
; ### Code
section .text
; Virtual Registers
; ARG1
; rcx == sha512_state *state
%define digest rcx
; ARG2
; rdx == const u8 *data
%define msg rdx
; ARG3
; r8 == int blocks
%define msglen r8
%define T1 rdi
%define T2 rbx
%define a_64 rsi
%define b_64 r9
%define c_64 r10
%define d_64 r11
%define e_64 r12
%define f_64 r13
%define g_64 r14
%define h_64 r15
%define tmp0 rax
; Local variables (stack frame)
; Message Schedule
%define W_SIZE 80*8
; W[t] + K[t] | W[t+1] + K[t+1]
%define WK_SIZE 2*8
%define RSPSAVE_SIZE 1*8
%define GPRSAVE_SIZE 8*8+5*16
%define frame_W 0
%define frame_WK frame_W + W_SIZE
%define frame_RSPSAVE frame_WK + WK_SIZE
%define frame_GPRSAVE frame_RSPSAVE + RSPSAVE_SIZE
%define frame_size frame_GPRSAVE + GPRSAVE_SIZE
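; Resulting layout, relative to the 32-byte-aligned RSP set up below
; (offsets follow from the sizes above):
;   [rsp +   0 .. 639]  W[0..79]  message schedule (80 qwords)
;   [rsp + 640 .. 655]  WK        W[t]+K[t] pair for the current rounds
;   [rsp + 656 .. 663]  caller's RSP
;   [rsp + 664 .. 807]  saved GPRs (8 qwords) and XMM0-XMM4 (5 dqwords)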
; Useful QWORD "arrays" for simpler memory references
; MSG, DIGEST, K_t, W_t are arrays
; WK_2(t) points to 1 of 2 qwords at frame_WK depending on t being odd/even
; Output Digest (arg1)
%define DIGEST(i) [digest + 8*i]
; Input message (arg2)
%define MSG(i) [msg + 8*i]
; SHA Constants (static mem)
%define K_t(i) [rel 8*i + ASM_PFX(SHA512_K)]
; Message Schedule (stack frame)
%define W_t(i) [rsp + 8*i + frame_W]
; W[t]+K[t] (stack frame)
%define WK_2(i) [rsp + 8*((i % 2)) + frame_WK]
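; The scheduler stores a 16-byte pair at frame_WK; WK_2(t) selects the low
; qword for even t and the high qword for odd t.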
%macro RotateState 0
; Rotate symbols a..h right
%xdefine TMP h_64
%xdefine h_64 g_64
%xdefine g_64 f_64
%xdefine f_64 e_64
%xdefine e_64 d_64
%xdefine d_64 c_64
%xdefine c_64 b_64
%xdefine b_64 a_64
%xdefine a_64 TMP
%endmacro
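; RotateState renames the symbols only; no data moves. After one rotation,
; a_64 refers to the register that held h_64, b_64 to the old a_64, and so
; on, implementing the per-round variable shuffle at zero runtime cost.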
%macro RORQ 2
; shld is faster than ror on Sandy Bridge
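; With identical destination and source, shld x, x, (64-n) shifts in x's own
; top bits, i.e. it rotates x left by (64-n) bits, which equals x ROR n.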
shld %1, %1, (64-%2)
%endmacro
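; The optimized rounds below evaluate the big sigmas with nested rotates
; instead of three independent rotate-and-xor chains:
;   Sigma1(e) = ((((e ROTR 23) ^ e) ROTR 4) ^ e) ROTR 14
;             = (e ROTR 41) ^ (e ROTR 18) ^ (e ROTR 14)
;   Sigma0(a) = ((((a ROTR 5) ^ a) ROTR 6) ^ a) ROTR 28
;             = (a ROTR 39) ^ (a ROTR 34) ^ (a ROTR 28)
; The "; 41", "; 18", ... side comments in the rounds track these cumulative
; rotation amounts.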
%macro SHA512_Round_Optimized 1
mov T1, f_64 ; T1 = f
mov tmp0, e_64 ; tmp = e
xor T1, g_64 ; T1 = f ^ g
RORQ tmp0, 23 ; 41 ; tmp = e ror 23
and T1, e_64 ; T1 = (f ^ g) & e
xor tmp0, e_64 ; tmp = (e ror 23) ^ e
xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
%assign idx %1
add T1, WK_2(idx) ; W[t] + K[t] from message scheduler
RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
mov T2, a_64 ; T2 = a
add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
mov tmp0, a_64 ; tmp = a
xor T2, c_64 ; T2 = a ^ c
and tmp0, c_64 ; tmp = a & c
and T2, b_64 ; T2 = (a ^ c) & b
xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
mov tmp0, a_64 ; tmp = a
RORQ tmp0, 5 ; 39 ; tmp = a ror 5
xor tmp0, a_64 ; tmp = (a ror 5) ^ a
add d_64, T1 ; e(next_state) = d + T1
RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
RotateState
%endmacro
; Compute Round t
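; Reference form of a SHA-512 round (FIPS 180-4):
;   T1 = h + Sigma[1,512](e) + Ch(e,f,g) + K[t] + W[t]
;   T2 = Sigma[0,512](a) + Maj(a,b,c)
;   h = g, g = f, f = e, e = d + T1, d = c, c = b, b = a, a = T1 + T2
; SHA512_Round follows this form literally; SHA512_Round_Optimized above
; computes the same values with fewer instructions.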
%macro SHA512_Round 1
; Ch(e,f,g) = (e & f) ^ (~e & g)
mov T1, e_64 ; T1 = e
and T1, f_64 ; T1 = e & f
mov tmp0, e_64 ; tmp0 = e
not tmp0 ; tmp0 = ~e
and tmp0, g_64 ; tmp0 = ~e & g
xor T1, tmp0 ; T1 = (e & f) ^ (~e & g)
; Sigma[1,512](e) = (e ROTR 14) ^ (e ROTR 18) ^ (e ROTR 41)
mov tmp0, e_64 ; tmp0 = e
RORQ tmp0, 14 ; tmp0 = e ROTR 14
mov T2, e_64 ; T2 = e
RORQ T2, 18 ; T2 = e ROTR 18
xor tmp0, T2 ; tmp0 = (e ROTR 14) ^ (e ROTR 18)
RORQ T2, 23 ; T2 = e ROTR 41
xor tmp0, T2 ; tmp0 = (e ROTR 14) ^ (e ROTR 18) ^ (e ROTR 41)
; T1 = h + Sigma[1,512](e) + Ch(e,f,g) + K[t] + W[t]
add T1, tmp0 ; T1 = Ch(e,f,g) + Sigma[1,512](e)
%assign idx %1
add T1, WK_2(idx) ; T1 = Ch(e,f,g) + Sigma[1,512](e) + W[t] + K[t]
add T1, h_64 ; T1 = Ch(e,f,g) + Sigma[1,512](e) + W[t] + K[t] + h
; Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c)
mov T2, a_64 ; T2 = a
and T2, b_64 ; T2 = a & b
mov tmp0, a_64 ; tmp0 = a
and tmp0, c_64 ; tmp0 = a & c
xor T2, tmp0 ; T2 = (a & b) ^ (a & c)
mov tmp0, b_64 ; tmp0 = b
and tmp0, c_64 ; tmp0 = b & c
xor T2, tmp0 ; T2 = (a & b) ^ (a & c) ^ (b & c)
RotateState ; a = h, b = a, c = b, d = c, e = d, f = e, g = f, h = g
add e_64, T1 ; e = d + T1
mov a_64, T1 ; a = T1
; Sigma[0,512](a) = (a ROTR 28) ^ (a ROTR 34) ^ (a ROTR 39)
mov T1, b_64 ; T1 = a, because now b == a
RORQ T1, 28 ; T1 = a ROTR 28
mov tmp0, b_64 ; tmp0 = a
RORQ tmp0, 34 ; tmp0 = a ROTR 34
xor T1, tmp0 ; T1 = (a ROTR 28) ^ (a ROTR 34)
RORQ tmp0, 5 ; tmp0 = a ROTR 39
xor T1, tmp0 ; T1 = (a ROTR 28) ^ (a ROTR 34) ^ (a ROTR 39)
; T2 = Sigma[0,512](a) + Maj(a,b,c)
add T2, T1 ; T2 = Maj(a,b,c) + Sigma[0,512](a)
add a_64, T2 ; a = T1 + T2
%endmacro
%macro SHA512_Stitched 1
; Compute rounds t-2 and t-1
; Compute message schedule QWORDS t and t+1
; Two rounds are computed based on the values for K[t-2]+W[t-2] and
; K[t-1]+W[t-1], which were previously stored at WK_2 by the message scheduler.
; The two new schedule QWORDS are stored at [W_t(t)] and [W_t(t+1)].
; They are then added to their respective SHA512 constants at
; [K_t(t)] and [K_t(t+1)] and stored at dqword [WK_2(t)].
; The computation of the message schedule and the rounds are tightly
; stitched to take advantage of instruction-level parallelism.
%assign idx (%1 - 2)
vmovdqu xmm4, W_t(idx) ; xmm4 = W[t-2]|W[t-1]
mov T1, f_64 ; T1 = f
mov tmp0, e_64 ; tmp = e
vpsrlq xmm0, xmm4, 19 ; xmm0 = W[t-2] >> 19
xor T1, g_64 ; T1 = f ^ g
RORQ tmp0, 23 ; 41 ; tmp = e ror 23
vpsllq xmm1, xmm4, (64-19) ; xmm1 = W[t-2] << 64-19
and T1, e_64 ; T1 = (f ^ g) & e
xor tmp0, e_64 ; tmp = (e ror 23) ^ e
vpor xmm0, xmm0, xmm1 ; xmm0 = (W[t-2] >> 19) | (W[t-2] << 64-19)
xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
%assign idxR (%1 - 2)
add T1, WK_2(idxR) ; W[t-2] + K[t-2] from message scheduler
vpsrlq xmm2, xmm4, 61 ; xmm2 = W[t-2] >> 61
RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
vpsllq xmm1, xmm4, (64-61) ; xmm1 = W[t-2] << 64-61
mov T2, a_64 ; T2 = a
add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
vpor xmm2, xmm2, xmm1 ; xmm2 = (W[t-2] >> 61) | (W[t-2] << 64-61)
RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
vpxor xmm0, xmm0, xmm2 ; xmm0 = (W[t-2] ROTR 19) ^ (W[t-2] ROTR 61)
mov tmp0, a_64 ; tmp = a
xor T2, c_64 ; T2 = a ^ c
vpsrlq xmm2, xmm4, 6 ; xmm2 = W[t-2] >> 6
and tmp0, c_64 ; tmp = a & c
and T2, b_64 ; T2 = (a ^ c) & b
vpxor xmm0, xmm0, xmm2 ; xmm0 = (W[t-2] ROTR 19) ^ (W[t-2] ROTR 61) ^ (W[t-2] SHR 6)
xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
mov tmp0, a_64 ; tmp = a
%assign idx (%1 - 15)
vmovdqu xmm4, W_t(idx) ; xmm4 = W[t-15]|W[t-14]
RORQ tmp0, 5 ; 39 ; tmp = a ror 5
xor tmp0, a_64 ; tmp = (a ror 5) ^ a
vpsrlq xmm1, xmm4, 1 ; xmm1 = W[t-15] >> 1
add d_64, T1 ; e(next_state) = d + T1
RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
vpsllq xmm2, xmm4, (64-1) ; xmm2 = W[t-15] << 64-1
xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
vpor xmm1, xmm1, xmm2 ; xmm1 = (W[t-15] >> 1) | (W[t-15] << 64-1)
RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
RotateState
vpsrlq xmm3, xmm4, 8 ; xmm3 = W[t-15] >> 8
mov T1, f_64 ; T1 = f
mov tmp0, e_64 ; tmp = e
vpsllq xmm2, xmm4, (64-8) ; xmm2 = W[t-15] << 64-8
xor T1, g_64 ; T1 = f ^ g
RORQ tmp0, 23 ; 41 ; tmp = e ror 23
vpor xmm3, xmm3, xmm2 ; xmm3 = (W[t-15] >> 8) | (W[t-15] << 64-8)
and T1, e_64 ; T1 = (f ^ g) & e
xor tmp0, e_64 ; tmp = (e ror 23) ^ e
vpxor xmm1, xmm1, xmm3 ; xmm1 = (W[t-15] ROTR 1) ^ (W[t-15] ROTR 8)
xor T1, g_64 ; T1 = ((f ^ g) & e) ^ g = CH(e,f,g)
%assign idxR (%1 - 1)
add T1, WK_2(idxR) ; W[t-1] + K[t-1] from message scheduler
vpsrlq xmm3, xmm4, 7 ; xmm3 = W[t-15] >> 7
RORQ tmp0, 4 ; 18 ; tmp = ((e ror 23) ^ e) ror 4
xor tmp0, e_64 ; tmp = (((e ror 23) ^ e) ror 4) ^ e
vpxor xmm1, xmm1, xmm3 ; xmm1 = (W[t-15] ROTR 1) ^ (W[t-15] ROTR 8) ^ (W[t-15] SHR 7)
mov T2, a_64 ; T2 = a
add T1, h_64 ; T1 = CH(e,f,g) + W[t] + K[t] + h
%assign idx (%1 - 7)
vpaddq xmm0, xmm0, W_t(idx) ; xmm0 = sigma[1,512](W[t-2]) + W[t-7]
RORQ tmp0, 14 ; 14 ; tmp = ((((e ror23)^e)ror4)^e)ror14 = S1(e)
add T1, tmp0 ; T1 = CH(e,f,g) + W[t] + K[t] + S1(e)
vpaddq xmm0, xmm0, xmm1 ; xmm0 = sigma[1,512](W[t-2]) + W[t-7] + sigma[0,512](W[t-15])
mov tmp0, a_64 ; tmp = a
xor T2, c_64 ; T2 = a ^ c
%assign idx (%1 - 16)
vpaddq xmm0, xmm0, W_t(idx) ; xmm0 = sigma[1,512](W[t-2]) + W[t-7] + sigma[0,512](W[t-15]) + W[t-16]
and tmp0, c_64 ; tmp = a & c
and T2, b_64 ; T2 = (a ^ c) & b
%assign idx %1
vmovdqa W_t(idx), xmm0 ; Store W[t]
xor T2, tmp0 ; T2 = ((a ^ c) & b) ^ (a & c) = Maj(a,b,c)
mov tmp0, a_64 ; tmp = a
vpaddq xmm0, xmm0, K_t(idx) ; Compute W[t]+K[t]
RORQ tmp0, 5 ; 39 ; tmp = a ror 5
xor tmp0, a_64 ; tmp = (a ror 5) ^ a
vmovdqa WK_2(idx), xmm0 ; Store W[t]+K[t] for next rounds
add d_64, T1 ; e(next_state) = d + T1
RORQ tmp0, 6 ; 34 ; tmp = ((a ror 5) ^ a) ror 6
xor tmp0, a_64 ; tmp = (((a ror 5) ^ a) ror 6) ^ a
lea h_64, [T1 + T2] ; a(next_state) = T1 + Maj(a,b,c)
RORQ tmp0, 28 ; 28 ; tmp = ((((a ror5)^a)ror6)^a)ror28 = S0(a)
add h_64, tmp0 ; a(next_state) = T1 + Maj(a,b,c) + S0(a)
RotateState
%endmacro
; Compute message schedules t and t+1
%macro SHA512_2Sched 1
; x ROTR n = (x >> n) | (x << 64-n)
; sigma[1,512](W[t-2]) = (W[t-2] ROTR 19) ^ (W[t-2] ROTR 61) ^ (W[t-2] SHR 6)
%assign idx (%1 - 2)
; W[t-2] ROTR 19
vmovdqu xmm4, W_t(idx) ; xmm4 = W[t-2]|W[t-1]
vpsrlq xmm0, xmm4, 19 ; xmm0 = W[t-2] >> 19
vpsllq xmm1, xmm4, (64-19) ; xmm1 = W[t-2] << 64-19
vpor xmm0, xmm0, xmm1 ; xmm0 = (W[t-2] >> 19) | (W[t-2] << 64-19)
; W[t-2] ROTR 61
vpsrlq xmm2, xmm4, 61 ; xmm2 = W[t-2] >> 61
vpsllq xmm1, xmm4, (64-61) ; xmm1 = W[t-2] << 64-61
vpor xmm2, xmm2, xmm1 ; xmm2 = (W[t-2] >> 61) | (W[t-2] << 64-61)
vpxor xmm0, xmm0, xmm2 ; xmm0 = (W[t-2] ROTR 19) ^ (W[t-2] ROTR 61)
; W[t-2] SHR 6
vpsrlq xmm2, xmm4, 6 ; xmm2 = W[t-2] >> 6
vpxor xmm0, xmm0, xmm2 ; xmm0 = (W[t-2] ROTR 19) ^ (W[t-2] ROTR 61) ^ (W[t-2] SHR 6)
; sigma[0,512](W[t-15]) = (W[t-15] ROTR 1) ^ (W[t-15] ROTR 8) ^ (W[t-15] SHR 7)
%assign idx (%1 - 15)
; W[t-15] ROTR 1
vmovdqu xmm4, W_t(idx) ; xmm4 = W[t-15]|W[t-14]
vpsrlq xmm1, xmm4, 1 ; xmm1 = W[t-15] >> 1
vpsllq xmm2, xmm4, (64-1) ; xmm2 = W[t-15] << 64-1
vpor xmm1, xmm1, xmm2 ; xmm1 = (W[t-15] >> 1) | (W[t-15] << 64-1)
; W[t-15] ROTR 8
vpsrlq xmm3, xmm4, 8 ; xmm3 = W[t-15] >> 8
vpsllq xmm2, xmm4, (64-8) ; xmm2 = W[t-15] << 64-8
vpor xmm3, xmm3, xmm2 ; xmm3 = (W[t-15] >> 8) | (W[t-15] << 64-8)
vpxor xmm1, xmm1, xmm3 ; xmm1 = (W[t-15] ROTR 1) ^ (W[t-15] ROTR 8)
; W[t-15] SHR 7
vpsrlq xmm3, xmm4, 7 ; xmm3 = W[t-15] >> 7
vpxor xmm1, xmm1, xmm3 ; xmm1 = (W[t-15] ROTR 1) ^ (W[t-15] ROTR 8) ^ (W[t-15] SHR 7)
; W[t] = sigma[1,512](W[t-2]) + W[t-7] + sigma[0,512](W[t-15]) + W[t-16]
%assign idx (%1 - 7)
vpaddq xmm0, xmm0, W_t(idx) ; xmm0 = sigma[1,512](W[t-2]) + W[t-7]
vpaddq xmm0, xmm0, xmm1 ; xmm0 = sigma[1,512](W[t-2]) + W[t-7] + sigma[0,512](W[t-15])
%assign idx (%1 - 16)
vpaddq xmm0, xmm0, W_t(idx) ; xmm0 = sigma[1,512](W[t-2]) + W[t-7] + sigma[0,512](W[t-15]) + W[t-16]
%assign idx %1
vmovdqa W_t(idx), xmm0 ; Store W[t]
vpaddq xmm0, xmm0, K_t(idx) ; Compute W[t]+K[t]
vmovdqa WK_2(idx), xmm0 ; Store W[t]+K[t] for next rounds
%endmacro
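; SHA512_2Sched is the standalone form of the schedule half of
; SHA512_Stitched; the commented-out decomposition in the round loop below
; shows the correspondence.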
; #######################################################################
; BOOLEAN TryEnableAccel ()
; To run in QEMU use options: -enable-kvm -cpu Penryn,+avx,+xsave,+xsaveopt
; #######################################################################
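; Enabling AVX requires, in order: the CPUID feature checks, CR4.OSXSAVE = 1,
; then XCR0[2:1] = 11b (XMM and YMM state) written via XSETBV, which is what
; the routine below does.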
align 8
global ASM_PFX(TryEnableAccel)
ASM_PFX(TryEnableAccel):
; Detect CPUID.1:ECX.XSAVE[bit 26] = 1 (CR4.OSXSAVE can be set to 1).
; Detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
push rbx
mov eax, 1 ; Feature Information
cpuid ; result in EAX, EBX, ECX, EDX
and ecx, 014000000H
cmp ecx, 014000000H ; check both XSAVE and AVX feature flags
jne noAVX
; processor supports AVX instructions
mov rax, cr4
bts rax, 18 ; OSXSAVE: enables XGETBV and XSETBV
mov cr4, rax
xor ecx, ecx ; select XCR0 (ECX = 0)
xgetbv ; read XCR0 into EDX:EAX
or eax, 06H ; enable both XMM and YMM state support
; XSETBV must be executed at privilege level 0 or in real-address mode.
xsetbv
mov rax, 1
mov byte [rel ASM_PFX(mIsAccelEnabled)], 1
jmp done
noAVX:
xor rax, rax
mov byte [rel ASM_PFX(mIsAccelEnabled)], 0
done:
pop rbx
ret
; #######################################################################
; void Sha512TransformAccel(sha512_state *state, const u8 *data, int blocks)
; Purpose: Updates the SHA512 digest stored at "state" with the message
; stored in "data".
; The size of the message pointed to by "data" must be an integer multiple
; of SHA512 message blocks.
; "blocks" is the message length in SHA512 blocks
; #######################################################################
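; Hedged usage sketch (Microsoft x64 convention assumed, matching the
; register assignments above; the labels are illustrative only):
;   call ASM_PFX(TryEnableAccel) ; AVX must be enabled before calling this
;   test eax, eax
;   jz   .fallback               ; hypothetical software fallback
;   lea  rcx, [state]            ; arg1: sha512_state *
;   lea  rdx, [buffer]           ; arg2: const u8 *
;   mov  r8d, 2                  ; arg3: number of 128-byte blocks
;   call ASM_PFX(Sha512TransformAccel)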
align 8
global ASM_PFX(Sha512TransformAccel)
ASM_PFX(Sha512TransformAccel):
test msglen, msglen
je nowork
; Allocate Stack Space
mov rax, rsp ; remember the caller's RSP
pushfq ; save RFLAGS (including IF) just below the caller's stack
cli ; interrupt handlers are not guaranteed to preserve the XMM state used below
sub rsp, frame_size
and rsp, ~(0x20 - 1) ; align the frame to 32 bytes
mov [rsp + frame_RSPSAVE], rax
; Save GPRs
; Registers RBX, RBP, RDI, RSI, R12, R13, R14, R15 are nonvolatile in the
; Microsoft x64 calling convention used by UEFI; XMM0-XMM4 are saved as well,
; because UEFI does not (officially) support vector registers as a part of
; the execution context.
mov [rsp + frame_GPRSAVE], rbx
mov [rsp + frame_GPRSAVE + 8*1], rbp
mov [rsp + frame_GPRSAVE + 8*2], rdi
mov [rsp + frame_GPRSAVE + 8*3], rsi
mov [rsp + frame_GPRSAVE + 8*4], r12
mov [rsp + frame_GPRSAVE + 8*5], r13
mov [rsp + frame_GPRSAVE + 8*6], r14
mov [rsp + frame_GPRSAVE + 8*7], r15
vmovdqu [rsp + frame_GPRSAVE + 8*8], xmm0
vmovdqu [rsp + frame_GPRSAVE + 8*8 + 16*1], xmm1
vmovdqu [rsp + frame_GPRSAVE + 8*8 + 16*2], xmm2
vmovdqu [rsp + frame_GPRSAVE + 8*8 + 16*3], xmm3
vmovdqu [rsp + frame_GPRSAVE + 8*8 + 16*4], xmm4
updateblock:
; Load state variables
mov a_64, DIGEST(0)
mov b_64, DIGEST(1)
mov c_64, DIGEST(2)
mov d_64, DIGEST(3)
mov e_64, DIGEST(4)
mov f_64, DIGEST(5)
mov g_64, DIGEST(6)
mov h_64, DIGEST(7)
%assign t 0
%rep 80/2 + 1
; (80 rounds) / (2 rounds/iteration) + (1 iteration)
; +1 iteration because the scheduler leads hashing by 1 iteration
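; Iteration map (t advances by 2):
;   t = 0        : byte-swap and schedule W[0..1] only (no rounds yet)
;   t = 2 .. 14  : byte-swap W[t..t+1], compute rounds t-2 and t-1
;   t = 16 .. 78 : stitched schedule of W[t..t+1] plus rounds t-2 and t-1
;   t = 80       : final rounds 78 and 79 (nothing left to schedule)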
%if t < 2
; BSWAP 2 QWORDS
vmovdqa xmm1, [rel XMM_QWORD_BSWAP]
vmovdqu xmm0, MSG(t)
vpshufb xmm0, xmm0, xmm1 ; BSWAP
vmovdqa W_t(t), xmm0 ; Store Scheduled Pair
vpaddq xmm0, xmm0, K_t(t) ; Compute W[t]+K[t]
vmovdqa WK_2(t), xmm0 ; Store into WK for rounds
%elif t < 16
; BSWAP 2 QWORDS ; Compute 2 Rounds
vmovdqu xmm0, MSG(t)
vpshufb xmm0, xmm0, xmm1 ; BSWAP
SHA512_Round_Optimized t-2 ; Round t-2
vmovdqa W_t(t), xmm0 ; Store Scheduled Pair
vpaddq xmm0, xmm0, K_t(t) ; Compute W[t]+K[t]
SHA512_Round_Optimized t-1 ; Round t-1
vmovdqa WK_2(t), xmm0 ; Store W[t]+K[t] into WK
%elif t < 79
; Schedule 2 QWORDS ; Compute 2 Rounds
; SHA512_Round_Optimized t-2
; SHA512_Round_Optimized t-1
; SHA512_2Sched t
SHA512_Stitched t
%else
; Compute 2 Rounds
SHA512_Round_Optimized t-2
SHA512_Round_Optimized t-1
%endif
%assign t t+2
%endrep
; Update digest: add the working variables back into the previous state
add DIGEST(0), a_64
add DIGEST(1), b_64
add DIGEST(2), c_64
add DIGEST(3), d_64
add DIGEST(4), e_64
add DIGEST(5), f_64
add DIGEST(6), g_64
add DIGEST(7), h_64
; Advance to next message block
add msg, 16*8
dec msglen
jnz updateblock
; Restore GPRs
mov rbx, [rsp + frame_GPRSAVE]
mov rbp, [rsp + frame_GPRSAVE + 8*1]
mov rdi, [rsp + frame_GPRSAVE + 8*2]
mov rsi, [rsp + frame_GPRSAVE + 8*3]
mov r12, [rsp + frame_GPRSAVE + 8*4]
mov r13, [rsp + frame_GPRSAVE + 8*5]
mov r14, [rsp + frame_GPRSAVE + 8*6]
mov r15, [rsp + frame_GPRSAVE + 8*7]
vmovdqu xmm0, [rsp + frame_GPRSAVE + 8*8]
vmovdqu xmm1, [rsp + frame_GPRSAVE + 8*8 + 16*1]
vmovdqu xmm2, [rsp + frame_GPRSAVE + 8*8 + 16*2]
vmovdqu xmm3, [rsp + frame_GPRSAVE + 8*8 + 16*3]
vmovdqu xmm4, [rsp + frame_GPRSAVE + 8*8 + 16*4]
; Restore Stack Pointer
mov rsp, [rsp + frame_RSPSAVE]
; Reenable the interrupts if they were previously enabled
mov rax, [rsp - 8] ; RFLAGS value saved by pushfq (RSP was just restored)
and rax, 200H ; isolate RFLAGS.IF
cmp rax, 200H
jne nowork
sti
nowork:
ret