<https://compiler-rt.llvm.org/index.html> boasts:
The builtins library provides optimized implementations of this
and other low-level routines, either in target-independent C form,
or as a heavily-optimized assembly.
Really?
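Both routines implement the same simple contract: they return 0 if the first argument is below the second, 1 if both are equal, and 2 if the first is above the second (the C-level names are __cmpdi2 and __ucmpdi2; the extra leading underscore in the listings below is just the i386 symbol decoration). The target-independent C form is roughly equivalent to the following sketch, written with <stdint.h> types instead of compiler-rt's own typedefs; note that it compares the high and low dwords separately, which is exactly the structure visible in the generated code:

#include <stdint.h>

/* Sketch only, not the verbatim compiler-rt source:
   returns 0 if a < b, 1 if a == b, 2 if a > b */
int __cmpdi2(int64_t a, int64_t b)
{
    int32_t  ah = (int32_t) (a >> 32), bh = (int32_t) (b >> 32);
    uint32_t al = (uint32_t) a,        bl = (uint32_t) b;

    if (ah < bh) return 0;
    if (ah > bh) return 2;
    if (al < bl) return 0;
    if (al > bl) return 2;
    return 1;
}

/* __ucmpdi2 is identical except that the high dwords are
   compared unsigned too */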
Left: inefficient code shipped in   # Right: slightly improved code,
      clang_rt.builtins-*           #        which the optimiser REALLY
                                    #        should have generated
___cmpdi2:
mov ecx, [esp+16]   # mov ecx, [esp+16]
xor eax, eax        # xor eax, eax
cmp [esp+8], ecx    # cmp ecx, [esp+8]
jl @f               # jg @f
mov eax, 2          # mov eax, 2
jg @f               # jl @f
mov ecx, [esp+4]    #
mov edx, [esp+12]   # mov ecx, [esp+12]
mov eax, 0          # xor eax, eax
cmp ecx, edx        # cmp ecx, [esp+4]
jb @f               # ja @f
cmp edx, ecx        #
mov eax, 1          #
adc eax, 0          # adc eax, 1
@@:                 # @@:
ret                 # ret
                    # 3 instructions fewer, 10 bytes saved
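                    # (the reversed operand order means the carry flag of
                    # "cmp ecx, [esp+4]" is set exactly when the first
                    # argument's low dword is above the second's, so
                    # "adc eax, 1" returns 1 for equal and 2 for greater
                    # low dwords; the second cmp and the "mov eax, 1" of
                    # the shipped code are therefore unnecessary)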
___ucmpdi2:
mov ecx, [esp+16]   # mov ecx, [esp+16]
xor eax, eax        # xor eax, eax
cmp [esp+8], ecx    # cmp ecx, [esp+8]
jb @f               # ja @f
mov eax, 2          # mov eax, 2
ja @f               # jb @f
mov ecx, [esp+4]    #
mov edx, [esp+12]   # mov ecx, [esp+12]
mov eax, 0          # xor eax, eax
cmp ecx, edx        # cmp ecx, [esp+4]
jb @f               # ja @f
cmp edx, ecx        #
mov eax, 1          #
adc eax, 0          # adc eax, 1
@@:                 # @@:
ret                 # ret
                    # 3 instructions fewer, 10 bytes saved
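Any replacement has to keep this 0/1/2 contract; a minimal check, assuming a 32-bit x86 build linked against the builtins library (clang_rt.builtins-* or libgcc), with the prototypes written out by hand:

#include <stdio.h>

/* provided by the builtins library */
int __cmpdi2(long long a, long long b);
int __ucmpdi2(unsigned long long a, unsigned long long b);

int main(void)
{
    const long long          s[] = {-0x100000000LL, -1, 0, 1, 0x100000000LL};
    const unsigned long long u[] = {0, 1, 0xFFFFFFFFULL, 0x100000000ULL, ~0ULL};
    unsigned i, j, failed = 0;

    for (i = 0; i < sizeof s / sizeof *s; i++)
        for (j = 0; j < sizeof s / sizeof *s; j++)
            failed += __cmpdi2(s[i], s[j]) != (s[i] < s[j] ? 0 : s[i] > s[j] ? 2 : 1);

    for (i = 0; i < sizeof u / sizeof *u; i++)
        for (j = 0; j < sizeof u / sizeof *u; j++)
            failed += __ucmpdi2(u[i], u[j]) != (u[i] < u[j] ? 0 : u[i] > u[j] ? 2 : 1);

    printf("%u mismatches\n", failed);
    return failed != 0;
}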
Now for properly written code, of course branch-free, faster and shorter: