Get atomics hopefully working on aarch64

Theodore Dubois 2018-11-22 19:56:03 -08:00
parent d0c868f83e
commit e6e02924e5
3 changed files with 214 additions and 81 deletions

View File

@ -46,51 +46,7 @@
.endifin
.ifin(\op, add,sub,adc,sbc)
# setting flags: a horror story
.ifb \s
# for 32-bit operands, we can just do the operation and the chip
# will set v and c right, which we copy
\op\()s _tmp, _tmp, \arg
cset w10, vs
strb w10, [_cpu, CPU_of]
.ifin(\op, add,adc)
cset w10, cs
.endifin
.ifin(\op, sub,sbc)
cset w10, cc
.endifin
strb w10, [_cpu, CPU_cf]
.else
# for 16 or 8 bit operands...
# first figure out unsigned overflow
uxt\s w10, _tmp
.ifin(\op, add,sub)
\op w10, w10, \arg, uxt\s
.endifin
.ifin(\op, adc,sbc)
uxt\s w9, \arg
\op w10, w10, w9
.endifin
.ifc \s,b
lsr w10, w10, 8
.else
lsr w10, w10, 16
.endif
strb w10, [_cpu, CPU_cf]
# now signed overflow
sxt\s w10, _tmp
.ifin(\op, add,sub)
\op _tmp, w10, \arg, sxt\s
.endifin
.ifin(\op, adc,sbc)
# help me
sxt\s w9, \arg
\op _tmp, w10, w9
.endifin
cmp _tmp, _tmp, sxt\s
cset w10, ne
strb w10, [_cpu, CPU_of]
.endif
do_add \op, _tmp, \arg, \s
.endifin
.ifc \op,imul
@ -154,6 +110,54 @@
ss \size, _do_op, \op, \arg
.endm
.macro do_add op, dst, src, s
# setting flags: a horror story
.ifb \s
# for 32-bit operands, we can just do the operation and the chip
# will set v and c right, which we copy
\op\()s \dst, \dst, \src
cset w10, vs
strb w10, [_cpu, CPU_of]
.ifin(\op, add,adc)
cset w10, cs
.endifin
.ifin(\op, sub,sbc)
cset w10, cc
.endifin
strb w10, [_cpu, CPU_cf]
.else
# for 16 or 8 bit operands...
# first figure out unsigned overflow
uxt\s w10, \dst
.ifin(\op, add,sub)
\op w10, w10, \src, uxt\s
.endifin
.ifin(\op, adc,sbc)
uxt\s w9, \src
\op w10, w10, w9
.endifin
.ifc \s,b
lsr w10, w10, 8
.else
lsr w10, w10, 16
.endif
strb w10, [_cpu, CPU_cf]
# now signed overflow
sxt\s w10, \dst
.ifin(\op, add,sub)
\op \dst, w10, \src, sxt\s
.endifin
.ifin(\op, adc,sbc)
# help me
sxt\s w9, \src
\op \dst, w10, w9
.endifin
cmp \dst, \dst, sxt\s
cset w10, ne
strb w10, [_cpu, CPU_of]
.endif
.endm
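
The do_add macro above is where the x86 carry and overflow flags come from. For 32-bit operands the ARM flags already mean the right thing, so V and C are copied straight out of NZCV; for 8- and 16-bit operands the macro widens both operands, takes the carry from bit 8 or 16 of the zero-extended result, and detects signed overflow by checking whether the result still equals its own sign extension. A rough C model of the 8-bit add case (add8_flags and struct flags are illustrative names, not from the source; sub/sbc follow the same shape with the borrow sense flipped):

#include <stdint.h>
#include <stdbool.h>

struct flags { bool cf, of; };

/* mirrors the uxtb / lsr #8 and sxtb / cmp paths of do_add for s = b */
static uint8_t add8_flags(uint8_t dst, uint8_t src, struct flags *f) {
    /* unsigned: add in a wide register, carry out is bit 8 */
    uint32_t u = (uint32_t)dst + (uint32_t)src;
    f->cf = (u >> 8) & 1;
    /* signed: add the sign-extended values; overflow if the result no
       longer fits in 8 bits, i.e. differs from its own sign extension */
    int32_t s = (int32_t)(int8_t)dst + (int32_t)(int8_t)src;
    f->of = s != (int8_t)s;
    return (uint8_t)u;
}
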
.macro do_reg_op op, armop, size, reg
.gadget \op\size\()_reg_\reg
do_op \armop, \size, e\reg\()x
@ -174,28 +178,38 @@
gret 1
.endif
.gadget \op\size\()_mem
.ifc \op,store
.ifnc \op,xchg
.gadget \op\size\()_mem
.ifc \op,store
write_prep \size, \op\size\()_mem
.else
read_prep \size, \op\size\()_mem
.endif
ldr\s w8, [_xaddr]
do_op \armop, \size, w8
.ifc \op,store
str\s w8, [_xaddr]
write_done \size, \op\size\()_mem
.endif
gret 1
.ifc \op,store
write_bullshit \size, \op\size\()_mem
.else
read_bullshit \size, \op\size\()_mem
.endif
.else
# xchg must be atomic
.gadget \op\size\()_mem
write_prep \size, \op\size\()_mem
.else; .ifc \op,xchg
write_prep \size, \op\size\()_mem
.else
read_prep \size, \op\size\()_mem
.endif; .endif
ldr\s w8, [_xaddr]
do_op \armop, \size, w8
.ifin(\op, store,xchg)
str\s w8, [_xaddr]
1:
ldaxr\s w8, [_xaddr]
stlxr\s w10, _tmp, [_xaddr]
cbnz w10, 1b
movs _tmp, w8
write_done \size, \op\size\()_mem
.endifin
gret 1
.ifc \op,store
gret 1
write_bullshit \size, \op\size\()_mem
.else; .ifc \op,xchg
write_bullshit \size, \op\size\()_mem
.else
read_bullshit \size, \op\size\()_mem
.endif; .endif
.endif
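
The non-xchg memory gadgets can keep using a plain ldr/str pair, but xchg has to be a real atomic exchange, so it gets a load-acquire/store-release exclusive pair (ldaxr/stlxr) that retries until the store-exclusive succeeds and then hands the old value back in _tmp. In C the same operation is just an atomic exchange; a minimal sketch using the GCC/Clang builtin (xchg32 is an illustrative name):

#include <stdint.h>

/* on AArch64 this compiles to an ldaxr/stlxr retry loop (or SWPAL when
   LSE atomics are available), which is what the gadget spells out by hand */
static uint32_t xchg32(uint32_t *addr, uint32_t newval) {
    return __atomic_exchange_n(addr, newval, __ATOMIC_SEQ_CST);
}
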
.irp reg, a,b,c,d
do_reg_op \op, \armop, \size, \reg
@ -214,6 +228,7 @@
.endif
gret
.endr
.endm
.irp op, load,store,xchg,add,sub,adc,sbb,and,or,xor
@ -238,9 +253,110 @@
.gadget_array \op
.endr
# atomics. oof
.macro do_op_size_atomic op, armop, size, s
.gadget atomic_\op\size\()_mem
# There's so much stuff going on inside most of these operations that
# the implementation is a compare-and-swap loop, instead of just ldaxr/stlxr
write_prep \size, atomic_\op\size\()_mem
ldr\s w12, [_xaddr]
1:
mov w8, w12
# do the operation
# dest = w8, src = _tmp
.ifin(\op, add,sub,adc,sbc)
setf_a src=_tmp, dst=w8
.endifin
.ifin(\op, and,orr,eor)
clearf_a
clearf_oc
.endifin
.ifin(\op, adc,sbc)
ldrb w10, [_cpu, CPU_cf]
.ifc \op,adc
cmp w10, 1
.else
mvn w10, w10
cmn w10, 1
.endif
.endifin
.ifin(\op, and,orr,eor)
\op w8, w8, _tmp
.endifin
.ifin(\op, add,sub,adc,sbc)
do_add \op, w8, _tmp, \s
.endifin
.ifc \op,xadd
# exchange, then add
mov w9, _tmp
mov _tmp, w8
mov w8, w9
do_add add, w8, _tmp, \s
.endif
.ifin(\op, add,sub,adc,sbc,and,orr,eor,xadd)
setf_zsp \s, val=w8
.endifin
.ifin(\op, inc,dec)
mov w10, 1
setf_a src=w10, dst=w8
.ifb \s
.ifc \op,inc
adds w8, w8, 1
.else
subs w8, w8, 1
.endif
cset w9, vs
.else
sxt\s w8, w8
.ifc \op,inc
adds w8, w8, 1
.else
subs w8, w8, 1
.endif
cmp w8, w8, sxt\s
cset w9, ne
.endif
strb w9, [_cpu, CPU_of]
setf_zsp \s
.endifin
2:
ldaxr\s w13, [_xaddr]
cmp w12, w13
b.ne 3f
stlxr\s w13, w8, [_xaddr]
cbnz w13, 2b
write_done \size, atomic_\op\size\()_mem
gret 1
write_bullshit \size, atomic_\op\size\()_mem
3:
dmb ish
mov w12, w13
b 1b
.endm
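
As the comment says, the bodies of these operations are too involved for a single ldaxr/stlxr window: the setf_* macros store flag bytes into the cpu struct, which is exactly the kind of store you do not want inside an exclusive-monitor section. So do_op_size_atomic takes a snapshot of the memory operand, runs the whole operation on the snapshot (flags and all, with adc/sbc first re-materializing the ARM carry from the saved CF byte), and only publishes the result with a compare-and-swap against that snapshot, reloading and redoing everything when another writer got there first. A sketch of the same retry structure in C for a 32-bit add (atomic_add32 and the cpu fields are illustrative stand-ins, and the flag formulas are mine, not lifted from the source):

#include <stdint.h>

struct cpu { uint8_t cf, of; /* ... */ };

static void atomic_add32(struct cpu *cpu, uint32_t *addr, uint32_t src) {
    uint32_t old = __atomic_load_n(addr, __ATOMIC_RELAXED);
    for (;;) {
        /* "do the operation" on the snapshot; flag writes only touch
           cpu state, never *addr, so redoing them on a retry is harmless */
        uint32_t result = old + src;
        cpu->cf = result < old;
        cpu->of = ((old ^ result) & (src ^ result)) >> 31;
        /* corresponds to the ldaxr/cmp/stlxr loop at label 2:; on a
           mismatch (label 3:) the freshly observed value becomes the new
           snapshot and the whole operation runs again */
        if (__atomic_compare_exchange_n(addr, &old, result, 1,
                                        __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
            return;
    }
}
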
.irp op, add,sub,adc,sbb,and,or,xor,inc,dec,xadd
.irp size, SIZE_LIST
.ifc \op,xor
ss \size, do_op_size_atomic, \op, eor
.else; .ifc \op,sbb
ss \size, do_op_size_atomic, \op, sbc
.else; .ifc \op,or
ss \size, do_op_size_atomic, \op, orr
.else
ss \size, do_op_size_atomic, \op, \op
.endif; .endif; .endif
.endr
.gadget_array atomic_\op
.endr
# unary operations (well, only one explicit operand)
# TODO OF (not CF)
.macro do_inc size, s
mov w10, 1
setf_a w10, _tmp

View File

@ -37,6 +37,39 @@
write_bullshit 32, cmpxchg32_mem
.gadget_array cmpxchg
.gadget atomic_cmpxchg32_mem
write_prep 32, atomic_cmpxchg32_mem
mov w12, eax
ldr w11, [_xaddr]
1:
mov w8, w11
subs w9, eax, w8
setf_zsp val=w9
setf_a eax, w8
setf_oc
csel eax, w8, eax, ne
csel w8, _tmp, w8, eq
cset w9, eq
# all that setf stuff writes to memory which means instead of just using
# ldaxr and stlxr we now have to do *another* compare-and-exchange
2:
ldaxr w10, [_xaddr]
cmp w10, w11
b.ne 3f
stlxr w10, w8, [_xaddr]
cbnz w10, 2b
write_done 32, atomic_cmpxchg32_mem
gret 1
write_bullshit 32, atomic_cmpxchg32_mem
3:
dmb ish
mov w11, w10
mov eax, w12
b 1b
.gadget_array atomic_cmpxchg
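
atomic_cmpxchg32 follows the same pattern but implements x86 CMPXCHG semantics: compare eax with the memory operand; if they match, the source value is stored and ZF is set, otherwise eax is loaded with the memory value. Because the setf macros again write into cpu state, the store has to be published through a compare-and-exchange retry loop rather than a bare ldaxr/stlxr. Reduced to just the ZF behaviour, a hedged C model looks like this (cmpxchg32 and the cpu fields are illustrative; the real gadget also derives the other arithmetic flags from the eax - mem subtraction):

#include <stdint.h>
#include <stdbool.h>

struct cpu { uint32_t eax; uint8_t zf; /* ... */ };

static void cmpxchg32(struct cpu *cpu, uint32_t *addr, uint32_t src) {
    uint32_t observed = cpu->eax;
    bool swapped = __atomic_compare_exchange_n(addr, &observed, src, 0,
                                               __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
    cpu->zf = swapped;        /* equal: memory now holds src */
    if (!swapped)
        cpu->eax = observed;  /* not equal: eax gets what was in memory */
}
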
.macro do_helper type, size=
.gadget helper_\type\size
.ifin(\type, read,write)

View File

@ -320,8 +320,6 @@ void helper_rdtsc(struct cpu_state *cpu);
#define CPUID() g(cpuid)
// atomic
// TODO the gadgets currently don't exist on arm
#if defined(__x86_64__)
#define atomic_op(type, src, dst,z) load(src, z); op(atomic_##type, dst, z)
#define ATOMIC_ADD(src, dst,z) atomic_op(add, src, dst, z)
#define ATOMIC_OR(src, dst,z) atomic_op(or, src, dst, z)
@ -335,20 +333,6 @@ void helper_rdtsc(struct cpu_state *cpu);
#define ATOMIC_CMPXCHG(src, dst,z) atomic_op(cmpxchg, src, dst, z)
#define ATOMIC_XADD(src, dst,z) load(src, z); op(atomic_xadd, dst, z); store(src, z)
#else
#define ATOMIC_ADD ADD
#define ATOMIC_OR OR
#define ATOMIC_ADC ADC
#define ATOMIC_SBB SBB
#define ATOMIC_AND AND
#define ATOMIC_SUB SUB
#define ATOMIC_XOR XOR
#define ATOMIC_INC INC
#define ATOMIC_DEC DEC
#define ATOMIC_CMPXCHG CMPXCHG
#define ATOMIC_XADD XADD
#endif
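
With the aarch64 gadgets now in place, the x86_64-only guard and the fallback to the plain non-atomic macros can go away, and every ATOMIC_* routes through atomic_op on both architectures. For illustration, the expansion implied by the #defines above (argument names taken from the macro itself, nothing repo-specific added):

/* Hypothetical expansion, following the #defines above:
 *   ATOMIC_ADD(src, dst, z)
 *     -> atomic_op(add, src, dst, z)
 *     -> load(src, z); op(atomic_add, dst, z)
 * so the source operand is loaded into the temporary first, and op() then
 * presumably picks the atomic_add gadget matching size z to do the locked
 * read-modify-write on the memory operand. */
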
// sse
#define XORP(src, dst) UNDEFINED
#define PSRLQ(src, dst) UNDEFINED