The example from our EuroLLVM talk was this:
_Atomic(int) a; a *= b;
This is (according to the spec) equivalent to this (simplified slightly):
int expected = a;
int desired;
do {
desired = expected * b;
} while (!compare_swap_weak(current, expected, desired));
What clang generates is almost this, but with a strong compare and swap:
define void @mul(i32* %a, i32 %b) #0 {
entry:
%atomic-load = load atomic i32* %a seq_cst, align 4, !tbaa !1
br label %atomic_op
atomic_op: ; preds = %atomic_op, %entry
%0 = phi i32 [ %atomic-load, %entry ], [ %1, %atomic_op ]
%mul = mul nsw i32 %0, %b
%1 = cmpxchg i32* %a, i32 %0, i32 %mul seq_cst
%2 = icmp eq i32 %1, %0
br i1 %2, label %atomic_cont, label %atomic_op
atomic_cont: ; preds = %atomic_op
ret void
}
This maps trivially to x86:
LBB0_1:
movl %ecx, %edx
imull %esi, %edx
movl %ecx, %eax
lock
cmpxchgl %edx, (%rdi)
cmpl %ecx, %eax
movl %eax, %ecx
jne LBB0_1
For MIPS, what we *should* be generating is:
sync 0 # Ensure all other loads / stores are globally visible
retry:
ll $t4, $a0 # Load the current value of the atomic int
mult $t4, $a1 # Multiply by the other argument
mflo $t4 # Get the result
sc $t4, $a0 # Try to write it back atomically
bnez $t4, entry # If we failed, try the whole thing again
sync 0 # branch delay slot - ensure seqcst behaviour here
What we actually generate is this:
# BB#0: # %entry
daddiu $sp, $sp, -16
sd $fp, 8($sp) # 8-byte Folded Spill
move $fp, $sp
addiu $3, $zero, 0
$BB0_1: # %entry
# =>This Inner Loop Header: Depth=1
ll $2, 0($4)
bne $2, $3, $BB0_3
nop
# BB#2: # %entry
# in Loop: Header=BB0_1 Depth=1
addiu $6, $zero, 0
sc $6, 0($4)
beqz $6, $BB0_1
nop
$BB0_3: # %entry
sync 0
$BB0_4: # %atomic_op
# =>This Loop Header: Depth=1
# Child Loop BB0_5 Depth 2
move $3, $2
mul $6, $3, $5
sync 0
$BB0_5: # %atomic_op
# Parent Loop BB0_4 Depth=1
# => This Inner Loop Header: Depth=2
ll $2, 0($4)
bne $2, $3, $BB0_7
nop
# BB#6: # %atomic_op
# in Loop: Header=BB0_5 Depth=2
move $7, $6
sc $7, 0($4)
beqz $7, $BB0_5
nop
$BB0_7: # %atomic_op
# in Loop: Header=BB0_4 Depth=1
sync 0
bne $2, $3, $BB0_4
nop
# BB#8: # %atomic_cont
move $sp, $fp
ld $fp, 8($sp) # 8-byte Folded Reload
jr $ra
daddiu $sp, $sp, 16
For correctness, we *have* to implement the cmpxchg in the IR as a ll/sc loop, and so we end up with a nested loop for something that is a single line in the source.
The idiom of the weak compare and exchange loop is a fairly common one, but we generate spectacularly bad code for it.
David