From e6e02924e581bf8a4134b6068a1368b64d19859e Mon Sep 17 00:00:00 2001
From: Theodore Dubois <tblodt@icloud.com>
Date: Thu, 22 Nov 2018 19:56:03 -0800
Subject: [PATCH] Get atomics hopefully working on aarch64

---
 jit/gadgets-aarch64/math.S | 246 +++++++++++++++++++++++++++----------
 jit/gadgets-aarch64/misc.S |  33 +++++
 jit/gen.c                  |  16 ---
 3 files changed, 214 insertions(+), 81 deletions(-)

diff --git a/jit/gadgets-aarch64/math.S b/jit/gadgets-aarch64/math.S
index 8dd13b6c..9bd7cbf3 100644
--- a/jit/gadgets-aarch64/math.S
+++ b/jit/gadgets-aarch64/math.S
@@ -46,51 +46,7 @@
     .endifin
 
     .ifin(\op, add,sub,adc,sbc)
-        # setting flags: a horror story
-        .ifb \s
-            # for 32-bit operands, we can just do the operation and the chip
-            # will set v and c right, which we copy
-            \op\()s _tmp, _tmp, \arg
-            cset w10, vs
-            strb w10, [_cpu, CPU_of]
-            .ifin(\op, add,adc)
-                cset w10, cs
-            .endifin
-            .ifin(\op, sub,sbc)
-                cset w10, cc
-            .endifin
-            strb w10, [_cpu, CPU_cf]
-        .else
-            # for 16 or 8 bit operands...
-            # first figure out unsigned overflow
-            uxt\s w10, _tmp
-            .ifin(\op, add,sub)
-                \op w10, w10, \arg, uxt\s
-            .endifin
-            .ifin(\op, adc,sbc)
-                uxt\s w9, \arg
-                \op w10, w10, w9
-            .endifin
-            .ifc \s,b
-                lsr w10, w10, 8
-            .else
-                lsr w10, w10, 16
-            .endif
-            strb w10, [_cpu, CPU_cf]
-            # now signed overflow
-            sxt\s w10, _tmp
-            .ifin(\op, add,sub)
-                \op _tmp, w10, \arg, sxt\s
-            .endifin
-            .ifin(\op, adc,sbc)
-                # help me
-                sxt\s w9, \arg
-                \op _tmp, w10, w9
-            .endifin
-            cmp _tmp, _tmp, sxt\s
-            cset w10, ne
-            strb w10, [_cpu, CPU_of]
-        .endif
+        do_add \op, _tmp, \arg, \s
     .endifin
 
     .ifc \op,imul
@@ -154,6 +110,54 @@
     ss \size, _do_op, \op, \arg
 .endm
 
+.macro do_add op, dst, src, s
+    # setting flags: a horror story. Performs dst = dst <op> src and stores the OF and CF bytes in the cpu struct; w10 (and w9 for narrow adc/sbc) are clobbered
+    .ifb \s
+        # for 32-bit operands (blank s), we can just do the flag-setting form of the operation and the chip
+        # will set v and c right, which we copy (for sub/sbc the CF byte is taken from the carry-clear condition)
+        \op\()s \dst, \dst, \src
+        cset w10, vs
+        strb w10, [_cpu, CPU_of]
+        .ifin(\op, add,adc)
+            cset w10, cs
+        .endifin
+        .ifin(\op, sub,sbc)
+            cset w10, cc
+        .endifin
+        strb w10, [_cpu, CPU_cf]
+    .else
+        # for 16 or 8 bit operands (s is h or b) the native flags describe the 32-bit result, so...
+        # first figure out unsigned overflow: redo the op on zero-extended operands and keep the bits above the operand width as CF
+        uxt\s w10, \dst
+        .ifin(\op, add,sub)
+            \op w10, w10, \src, uxt\s
+        .endifin
+        .ifin(\op, adc,sbc)
+            uxt\s w9, \src
+            \op w10, w10, w9
+        .endifin
+        .ifc \s,b
+            lsr w10, w10, 8
+        .else
+            lsr w10, w10, 16
+        .endif
+        strb w10, [_cpu, CPU_cf]
+        # now signed overflow: do the real op on sign-extended operands, OF is set when the result differs from its own sign-extension
+        sxt\s w10, \dst
+        .ifin(\op, add,sub)
+            \op \dst, w10, \src, sxt\s
+        .endifin
+        .ifin(\op, adc,sbc)
+            # help me: adc/sbc are given a pre-extended copy of src in w9 rather than an extend operand
+            sxt\s w9, \src
+            \op \dst, w10, w9
+        .endifin
+        cmp \dst, \dst, sxt\s
+        cset w10, ne
+        strb w10, [_cpu, CPU_of]
+    .endif
+.endm
+
 .macro do_reg_op op, armop, size, reg
     .gadget \op\size\()_reg_\reg
         do_op \armop, \size, e\reg\()x
@@ -174,28 +178,38 @@
             gret 1
     .endif
 
-    .gadget \op\size\()_mem
-        .ifc \op,store
+    .ifnc \op,xchg
+        .gadget \op\size\()_mem
+            .ifc \op,store
+                write_prep \size, \op\size\()_mem
+            .else
+                read_prep \size, \op\size\()_mem
+            .endif
+            ldr\s w8, [_xaddr]
+            do_op \armop, \size, w8
+            .ifc \op,store
+                str\s w8, [_xaddr]
+                write_done \size, \op\size\()_mem
+            .endif
+            gret 1
+            .ifc \op,store
+                write_bullshit \size, \op\size\()_mem
+            .else
+                read_bullshit \size, \op\size\()_mem
+            .endif
+    .else
+        # xchg must be atomic
+        .gadget \op\size\()_mem
             write_prep \size, \op\size\()_mem
-        .else N .ifc \op,xchg
-            write_prep \size, \op\size\()_mem
-        .else
-            read_prep \size, \op\size\()_mem
-        .endif N .endif
-        ldr\s w8, [_xaddr]
-        do_op \armop, \size, w8
-        .ifin(\op, store,xchg)
-            str\s w8, [_xaddr]
+        1:
+            ldaxr\s w8, [_xaddr]
+            stlxr\s w10, _tmp, [_xaddr]
+            cbnz w10, 1b
+            movs _tmp, w8
             write_done \size, \op\size\()_mem
-        .endifin
-        gret 1
-        .ifc \op,store
+            gret 1
             write_bullshit \size, \op\size\()_mem
-        .else N .ifc \op,xchg
-            write_bullshit \size, \op\size\()_mem
-        .else
-            read_bullshit \size, \op\size\()_mem
-        .endif N .endif
+    .endif
 
     .irp reg, a,b,c,d
         do_reg_op \op, \armop, \size, \reg
@@ -214,6 +228,7 @@
             .endif
             gret
     .endr
+
 .endm
 
 .irp op, load,store,xchg,add,sub,adc,sbb,and,or,xor
@@ -238,9 +253,110 @@
     .gadget_array \op
 .endr
 
+# atomics. oof
+
+.macro do_op_size_atomic op, armop, size, s
+    .gadget atomic_\op\size\()_mem
+        # Atomically applies the operation to the memory at _xaddr, with _tmp as the source operand.
+        # op is the x86 name (used in the gadget name), armop the matching A64 mnemonic, s the ldr/ldaxr/stlxr size suffix.
+        # So much happens inside most of these operations that the implementation is a compare-and-swap loop,
+        # instead of just ldaxr/stlxr: w12 = value expected in memory, w8 = new value to store
+        write_prep \size, atomic_\op\size\()_mem
+        ldr\s w12, [_xaddr]
+    1:
+        mov w8, w12
+        # do the operation: dest = w8, src = _tmp. Conditions test armop, since op is sbb/or/xor where the mnemonic is sbc/orr/eor
+        # NOTE(review): when the loop retries, adc/sbc reload CPU_cf after do_add already overwrote it - confirm and fix separately
+        .ifin(\armop, add,sub,adc,sbc)
+            setf_a src=_tmp, dst=w8
+        .endifin
+        .ifin(\armop, and,orr,eor)
+            clearf_a
+            clearf_oc
+        .endifin
+        .ifin(\armop, adc,sbc)
+            ldrb w10, [_cpu, CPU_cf]
+            .ifc \armop,adc
+                cmp w10, 1
+            .else
+                mvn w10, w10
+                cmn w10, 1
+            .endif
+        .endifin
+
+        .ifin(\armop, and,orr,eor)
+            \armop w8, w8, _tmp
+        .endifin
+        .ifin(\armop, add,sub,adc,sbc)
+            do_add \armop, w8, _tmp, \s
+        .endifin
+        .ifc \op,xadd
+            # add only: _tmp has to stay the source until the store succeeds (it gets the old value below), or a retry would add the wrong operand
+            do_add add, w8, _tmp, \s
+        .endif
+        .ifin(\armop, add,sub,adc,sbc,and,orr,eor,xadd)
+            setf_zsp \s, val=w8
+        .endifin
+
+        .ifin(\armop, inc,dec)
+            mov w10, 1
+            setf_a src=w10, dst=w8
+            .ifb \s
+                .ifc \op,inc
+                    adds w8, w8, 1
+                .else
+                    subs w8, w8, 1
+                .endif
+                cset w9, vs
+            .else
+                sxt\s w8, w8
+                .ifc \op,inc
+                    adds w8, w8, 1
+                .else
+                    subs w8, w8, 1
+                .endif
+                cmp w8, w8, sxt\s
+                cset w9, ne
+            .endif
+            strb w9, [_cpu, CPU_of]
+            setf_zsp \s, val=w8
+        .endifin
+
+    2:
+        ldaxr\s w13, [_xaddr]
+        cmp w12, w13
+        b.ne 3f
+        stlxr\s w13, w8, [_xaddr]
+        cbnz w13, 2b
+        .ifc \op,xadd
+            mov _tmp, w12
+        .endif
+        write_done \size, atomic_\op\size\()_mem
+        gret 1
+        write_bullshit \size, atomic_\op\size\()_mem
+    3:
+        dmb ish
+        mov w12, w13
+        b 1b
+.endm
+
+.irp op, add,sub,adc,sbb,and,or,xor,inc,dec,xadd
+    .irp size, SIZE_LIST
+        .ifc \op,sbb
+            ss \size, do_op_size_atomic, \op, sbc
+        .else N .ifc \op,or
+            ss \size, do_op_size_atomic, \op, orr
+        .else N .ifc \op,xor
+            ss \size, do_op_size_atomic, \op, eor
+        .else
+            ss \size, do_op_size_atomic, \op, \op
+        .endif N .endif N .endif
+    .endr
+    .gadget_array atomic_\op
+.endr
+
 # unary operations (well, only one explicit operand)
 
-# TODO OF (not CF)
 .macro do_inc size, s
     mov w10, 1
     setf_a w10, _tmp
diff --git a/jit/gadgets-aarch64/misc.S b/jit/gadgets-aarch64/misc.S
index d4ea1b86..6e1f3ea3 100644
--- a/jit/gadgets-aarch64/misc.S
+++ b/jit/gadgets-aarch64/misc.S
@@ -37,6 +37,39 @@
     write_bullshit 32, cmpxchg32_mem
 .gadget_array cmpxchg
 
+.gadget atomic_cmpxchg32_mem
+    # Atomic 32-bit cmpxchg: when eax equals the value at _xaddr, _tmp is stored there;
+    # otherwise eax receives that value. The setf macros are fed eax minus the memory value.
+    write_prep 32, atomic_cmpxchg32_mem
+    mov w12, eax // keep the incoming eax so a retry can restore it
+    ldr w11, [_xaddr] // w11 = memory value this attempt is based on
+1:
+    mov w8, w11
+    subs w9, eax, w8 // compare eax with the memory value
+    setf_zsp val=w9
+    setf_a eax, w8
+    setf_oc
+    csel eax, w8, eax, ne // mismatch: eax takes the memory value (presumably the setf macros leave NZCV intact - verify)
+    csel w8, _tmp, w8, eq // match: store _tmp, otherwise write back the value that was read
+    cset w9, eq // NOTE(review): w9 is not read again in the visible code - looks unused, confirm
+    # all that setf stuff writes to memory, which means that a plain ldaxr/stlxr pair
+    # is not enough: we now have to do *another* compare-and-exchange against w11
+2:
+    ldaxr w10, [_xaddr]
+    cmp w10, w11
+    b.ne 3f // memory changed since w11 was read: redo the comparison
+    stlxr w10, w8, [_xaddr]
+    cbnz w10, 2b // nonzero status: the exclusive store did not go through, try it again
+    write_done 32, atomic_cmpxchg32_mem
+    gret 1
+    write_bullshit 32, atomic_cmpxchg32_mem
+3:
+    dmb ish
+    mov w11, w10 // base the next attempt on the value just observed
+    mov eax, w12 // undo any eax update made by the failed attempt
+    b 1b
+.gadget_array atomic_cmpxchg
+
 .macro do_helper type, size=
     .gadget helper_\type\size
         .ifin(\type, read,write)
diff --git a/jit/gen.c b/jit/gen.c
index fa369e9a..7217b753 100644
--- a/jit/gen.c
+++ b/jit/gen.c
@@ -320,8 +320,6 @@ void helper_rdtsc(struct cpu_state *cpu);
 #define CPUID() g(cpuid)
 
 // atomic
-// TODO the gadgets currently don't exist on arm
-#if defined(__x86_64__)
 #define atomic_op(type, src, dst,z) load(src, z); op(atomic_##type, dst, z)
 #define ATOMIC_ADD(src, dst,z) atomic_op(add, src, dst, z)
 #define ATOMIC_OR(src, dst,z) atomic_op(or, src, dst, z)
@@ -335,20 +333,6 @@ void helper_rdtsc(struct cpu_state *cpu);
 #define ATOMIC_CMPXCHG(src, dst,z) atomic_op(cmpxchg, src, dst, z)
 #define ATOMIC_XADD(src, dst,z) load(src, z); op(atomic_xadd, dst, z); store(src, z)
 
-#else
-#define ATOMIC_ADD ADD
-#define ATOMIC_OR OR
-#define ATOMIC_ADC ADC
-#define ATOMIC_SBB SBB
-#define ATOMIC_AND AND
-#define ATOMIC_SUB SUB
-#define ATOMIC_XOR XOR
-#define ATOMIC_INC INC
-#define ATOMIC_DEC DEC
-#define ATOMIC_CMPXCHG CMPXCHG
-#define ATOMIC_XADD XADD
-#endif
-
 // sse
 #define XORP(src, dst) UNDEFINED
 #define PSRLQ(src, dst) UNDEFINED