From e6e02924e581bf8a4134b6068a1368b64d19859e Mon Sep 17 00:00:00 2001
From: Theodore Dubois
Date: Thu, 22 Nov 2018 19:56:03 -0800
Subject: [PATCH] Get atomics hopefully working on aarch64

---
 jit/gadgets-aarch64/math.S | 246 +++++++++++++++++++++++++++----------
 jit/gadgets-aarch64/misc.S |  33 +++++
 jit/gen.c                  |  16 ---
 3 files changed, 214 insertions(+), 81 deletions(-)

diff --git a/jit/gadgets-aarch64/math.S b/jit/gadgets-aarch64/math.S
index 8dd13b6c..9bd7cbf3 100644
--- a/jit/gadgets-aarch64/math.S
+++ b/jit/gadgets-aarch64/math.S
@@ -46,51 +46,7 @@
     .endifin
 
     .ifin(\op, add,sub,adc,sbc)
-        # setting flags: a horror story
-        .ifb \s
-            # for 32-bit operands, we can just do the operation and the chip
-            # will set v and c right, which we copy
-            \op\()s _tmp, _tmp, \arg
-            cset w10, vs
-            strb w10, [_cpu, CPU_of]
-            .ifin(\op, add,adc)
-                cset w10, cs
-            .endifin
-            .ifin(\op, sub,sbc)
-                cset w10, cc
-            .endifin
-            strb w10, [_cpu, CPU_cf]
-        .else
-            # for 16 or 8 bit operands...
-            # first figure out unsigned overflow
-            uxt\s w10, _tmp
-            .ifin(\op, add,sub)
-                \op w10, w10, \arg, uxt\s
-            .endifin
-            .ifin(\op, adc,sbc)
-                uxt\s w9, \arg
-                \op w10, w10, w9
-            .endifin
-            .ifc \s,b
-                lsr w10, w10, 8
-            .else
-                lsr w10, w10, 16
-            .endif
-            strb w10, [_cpu, CPU_cf]
-            # now signed overflow
-            sxt\s w10, _tmp
-            .ifin(\op, add,sub)
-                \op _tmp, w10, \arg, sxt\s
-            .endifin
-            .ifin(\op, adc,sbc)
-                # help me
-                sxt\s w9, \arg
-                \op _tmp, w10, w9
-            .endifin
-            cmp _tmp, _tmp, sxt\s
-            cset w10, ne
-            strb w10, [_cpu, CPU_of]
-        .endif
+        do_add \op, _tmp, \arg, \s
     .endifin
 
     .ifc \op,imul
@@ -154,6 +110,54 @@
     ss \size, _do_op, \op, \arg
 .endm
 
+.macro do_add op, dst, src, s
+    # Applies op to dst and src, then stores the overflow and carry results in CPU_of and CPU_cf (scratch: w9, w10)
+    .ifb \s
+        # blank size suffix, the 32-bit case: the flag-setting form of the instruction
+        # produces v and c directly and cset copies them out (cc instead of cs for sub/sbc)
+        \op\()s \dst, \dst, \src
+        cset w10, vs
+        strb w10, [_cpu, CPU_of]
+        .ifin(\op, add,adc)
+            cset w10, cs
+        .endifin
+        .ifin(\op, sub,sbc)
+            cset w10, cc
+        .endifin
+        strb w10, [_cpu, CPU_cf]
+    .else
+        # for 16 or 8 bit operands both flags are derived by hand
+        # carry: repeat the operation on zero-extended inputs and store the bits above the operand width
+        uxt\s w10, \dst
+        .ifin(\op, add,sub)
+            \op w10, w10, \src, uxt\s
+        .endifin
+        .ifin(\op, adc,sbc)
+            uxt\s w9, \src
+            \op w10, w10, w9
+        .endifin
+        .ifc \s,b
+            lsr w10, w10, 8
+        .else
+            lsr w10, w10, 16
+        .endif
+        strb w10, [_cpu, CPU_cf]
+        # overflow: repeat it on sign-extended inputs, then check the result still equals its own sign-extension
+        sxt\s w10, \dst
+        .ifin(\op, add,sub)
+            \op \dst, w10, \src, sxt\s
+        .endifin
+        .ifin(\op, adc,sbc)
+            # adc/sbc path: src is sign-extended into w9 first rather than through an extend operand
+            sxt\s w9, \src
+            \op \dst, w10, w9
+        .endifin
+        cmp \dst, \dst, sxt\s
+        cset w10, ne
+        strb w10, [_cpu, CPU_of]
+    .endif
+.endm
+
 .macro do_reg_op op, armop, size, reg
     .gadget \op\size\()_reg_\reg
         do_op \armop, \size, e\reg\()x
@@ -174,28 +178,38 @@
         gret 1
     .endif
 
-    .gadget \op\size\()_mem
-        .ifc \op,store
+    .ifnc \op,xchg
+        .gadget \op\size\()_mem
+            .ifc \op,store
+                write_prep \size, \op\size\()_mem
+            .else
+                read_prep \size, \op\size\()_mem
+            .endif
+            ldr\s w8, [_xaddr]
+            do_op \armop, \size, w8
+            .ifc \op,store
+                str\s w8, [_xaddr]
+                write_done \size, \op\size\()_mem
+            .endif
+            gret 1
+            .ifc \op,store
+                write_bullshit \size, \op\size\()_mem
+            .else
+                read_bullshit \size, \op\size\()_mem
+            .endif
+    .else
+        # xchg must be atomic: exclusive load/store pair, retried until the store-exclusive reports success
+        .gadget \op\size\()_mem
             write_prep \size, \op\size\()_mem
-        .else N .ifc \op,xchg
-            write_prep \size, \op\size\()_mem
-        .else
-            read_prep \size, \op\size\()_mem
-        .endif N .endif
-        ldr\s w8, [_xaddr]
-        do_op \armop, \size, w8
-        .ifin(\op, store,xchg)
-            str\s w8, [_xaddr]
+        1:
+            ldaxr\s w8, [_xaddr]
+            stlxr\s w10, _tmp, [_xaddr]
+            cbnz w10, 1b
+            movs _tmp, w8
             write_done \size, \op\size\()_mem
-        .endifin
-        gret 1
-        .ifc \op,store
+            gret 1
             write_bullshit \size, \op\size\()_mem
-        .else N .ifc \op,xchg
-            write_bullshit \size, \op\size\()_mem
-        .else
-            read_bullshit \size, \op\size\()_mem
-        .endif N .endif
+    .endif
 
     .irp reg, a,b,c,d
         do_reg_op \op, \armop, \size, \reg
@@ -214,6 +228,7 @@
         .endif
         gret
     .endr
+
 .endm
 
 .irp op, load,store,xchg,add,sub,adc,sbb,and,or,xor
@@ -238,9 +253,110 @@
     .gadget_array \op
 .endr
 
+# atomic read-modify-write gadgets, one per x86 op and operand size
+
+.macro do_op_size_atomic op, armop, size, s
+    .gadget atomic_\op\size\()_mem
+        # op names the gadget (x86 name); armop is the arm mnemonic, so every test below uses armop or sbb/or/xor would match nothing.
+        # The operations do too much for a bare ldaxr/stlxr pair, so the result is published by the compare-and-swap loop at 2:
+        write_prep \size, atomic_\op\size\()_mem
+        ldr\s w12, [_xaddr]
+    1:
+        mov w8, w12
+
+        # do the operation on w8 and update the emulated flags from it (setf_zsp is always given val=w8)
+        # dest = w8, src = _tmp. NOTE(review): a retry from 3: reruns this after do_add already rewrote CPU_cf - confirm adc/sbb
+        .ifin(\armop, add,sub,adc,sbc)
+            setf_a src=_tmp, dst=w8
+        .endifin
+        .ifin(\armop, and,orr,eor)
+            clearf_a
+            clearf_oc
+        .endifin
+        .ifin(\armop, adc,sbc)
+            ldrb w10, [_cpu, CPU_cf]
+            .ifc \armop,adc
+                cmp w10, 1
+            .else
+                mvn w10, w10
+                cmn w10, 1
+            .endif
+        .endifin
+
+        .ifin(\armop, and,orr,eor)
+            \armop w8, w8, _tmp
+        .endifin
+        .ifin(\armop, add,sub,adc,sbc)
+            do_add \armop, w8, _tmp, \s
+        .endifin
+        .ifc \armop,xadd
+            # exchange, then add. NOTE(review): _tmp is overwritten here and 3: does not restore it before retrying - confirm
+            mov w9, _tmp
+            mov _tmp, w8
+            mov w8, w9
+            do_add add, w8, _tmp, \s
+        .endif
+
+        .ifin(\armop, add,sub,adc,sbc,and,orr,eor,xadd)
+            setf_zsp \s, val=w8
+        .endifin
+
+        .ifin(\armop, inc,dec)
+            mov w10, 1
+            setf_a src=w10, dst=w8
+            .ifb \s
+                .ifc \armop,inc
+                    adds w8, w8, 1
+                .else
+                    subs w8, w8, 1
+                .endif
+                cset w9, vs
+            .else
+                sxt\s w8, w8
+                .ifc \armop,inc
+                    adds w8, w8, 1
+                .else
+                    subs w8, w8, 1
+                .endif
+                cmp w8, w8, sxt\s
+                cset w9, ne
+            .endif
+            strb w9, [_cpu, CPU_of]
+            setf_zsp \s, val=w8
+        .endifin
+
+    2:
+        ldaxr\s w13, [_xaddr]
+        cmp w12, w13
+        b.ne 3f
+        stlxr\s w13, w8, [_xaddr]
+        cbnz w13, 2b
+        write_done \size, atomic_\op\size\()_mem
+        gret 1
+        write_bullshit \size, atomic_\op\size\()_mem
+    3:
+        dmb ish
+        mov w12, w13
+        b 1b
+.endm
+
+.irp op, add,sub,adc,sbb,and,or,xor,inc,dec,xadd
+    .irp size, SIZE_LIST
+        .ifc \op,xor
+            ss \size, do_op_size_atomic, \op, eor
+        .else N .ifc \op,sbb
+            ss \size, do_op_size_atomic, \op, sbc
+        .else N .ifc \op,or
+            ss \size, do_op_size_atomic, \op, orr
+        .else
+            ss \size, do_op_size_atomic, \op, \op
+        .endif N .endif N .endif
+    .endr
+    .gadget_array atomic_\op
+.endr
+
+# unary operations (well, only one explicit operand)
-# TODO OF (not CF)
 .macro do_inc size, s
     mov w10, 1
     setf_a w10, _tmp
diff --git a/jit/gadgets-aarch64/misc.S b/jit/gadgets-aarch64/misc.S
index d4ea1b86..6e1f3ea3 100644
--- a/jit/gadgets-aarch64/misc.S
+++ b/jit/gadgets-aarch64/misc.S
@@ -37,6 +37,39 @@
     write_bullshit 32, cmpxchg32_mem
 .gadget_array cmpxchg
 
+.gadget atomic_cmpxchg32_mem
+    write_prep 32, atomic_cmpxchg32_mem
+    mov w12, eax
+    ldr w11, [_xaddr]
+1:
+    mov w8, w11
+    subs w9, eax, w8
+    setf_zsp val=w9
+    setf_a eax, w8
+    setf_oc
+    csel eax, w8, eax, ne
+    csel w8, _tmp, w8, eq
+    cset w9, eq
+
+    # all that setf stuff writes to memory which means instead of just using
+    # ldaxr and stlxr we now have to do *another* compare-and-exchange
+2:
+    ldaxr w10, [_xaddr]
+    cmp w10, w11
+    b.ne 3f
+    stlxr w10, w8, [_xaddr]
+    cbnz w10, 2b
+
+    write_done 32, atomic_cmpxchg32_mem
+    gret 1
+    write_bullshit 32, atomic_cmpxchg32_mem
+3:
+    dmb ish
+    mov w11, w10
+    mov eax, w12
+    b 1b
+.gadget_array atomic_cmpxchg
+
 .macro do_helper type, size=
     .gadget helper_\type\size
         .ifin(\type, read,write)
diff --git a/jit/gen.c b/jit/gen.c
index fa369e9a..7217b753 100644
--- a/jit/gen.c
+++ b/jit/gen.c
@@ -320,8 +320,6 @@ void helper_rdtsc(struct cpu_state *cpu);
 #define CPUID() g(cpuid)
 
 // atomic
-// TODO the gadgets currently don't exist on arm
-#if defined(__x86_64__)
 #define atomic_op(type, src, dst,z) load(src, z); op(atomic_##type, dst, z)
 #define ATOMIC_ADD(src, dst,z) atomic_op(add, src, dst, z)
 #define ATOMIC_OR(src, dst,z) atomic_op(or, src, dst, z)
@@ -335,20 +333,6 @@ void helper_rdtsc(struct cpu_state *cpu);
 #define ATOMIC_CMPXCHG(src, dst,z) atomic_op(cmpxchg, src, dst, z)
 #define ATOMIC_XADD(src, dst,z) load(src, z); op(atomic_xadd, dst, z); store(src, z)
 
-#else
-#define ATOMIC_ADD ADD
-#define ATOMIC_OR OR
-#define ATOMIC_ADC ADC
-#define ATOMIC_SBB SBB
-#define ATOMIC_AND AND
-#define ATOMIC_SUB SUB
-#define ATOMIC_XOR XOR
-#define ATOMIC_INC INC
-#define ATOMIC_DEC DEC
-#define ATOMIC_CMPXCHG CMPXCHG
-#define ATOMIC_XADD XADD
-#endif
-
 // sse
 #define XORP(src, dst) UNDEFINED
 #define PSRLQ(src, dst) UNDEFINED