Where's the optimiser gone (part 10): spotting a cookie

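Compile the two functions below with clang -O3 -m32, or generate an
assembly listing of __divdi3 and __moddi3 as shipped in
clang_rt.builtins-i386.lib. In the side-by-side listings, the left
column is the code clang generates and the right column is properly
optimised code.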

// Unsigned 64-bit division helper; only declared here, not defined in
// this text.
// As used below: __divdi3 takes the return value as the quotient, and
// __moddi3 reads the value stored through 'remainder' as the remainder.
// NOTE(review): __divdi3 passes 0 for 'remainder' -- presumably a null
// pointer means "remainder not wanted"; confirm against the definition.
unsigned long long __udivmoddi4(unsigned long long numerator,
                                unsigned long long denominator,
                                unsigned long long *remainder);

// Signed 64-bit remainder built on the unsigned helper: both operands are
// replaced by their magnitudes, __udivmoddi4() is called on those, and the
// remainder it stores is then given the sign of the dividend.
//
// The sign masks rely on '>>' of a negative long long being an arithmetic
// shift (yielding -1); C leaves that implementation-defined. The shift
// count 63 also assumes long long is exactly 64 bits wide.
// NOTE(review): when an operand is the most negative long long,
// (x ^ -1) - -1 overflows the signed type -- confirm the target wraps, or
// perform the negation in unsigned arithmetic.
long long __moddi3(long long dividend, long long divisor)
{
    long long r = divisor >> 63; // r = divisor < 0 ? -1 : 0
    long long s = dividend >> 63; // s = dividend < 0 ? -1 : 0
    divisor = (divisor ^ r) - r; // negate if divisor < 0
    dividend = (dividend ^ s) - s; // negate if dividend < 0
    // r is no longer needed as a sign mask; its storage is reused to
    // receive the remainder, hence the pointer cast. The value returned
    // by the call is discarded.
    __udivmoddi4(dividend, divisor, (unsigned long long *) &r);
    return (r ^ s) - s; // negate if dividend < 0
}

___moddi3:
00: 55 push ebp |
01: 89 E5 mov ebp, esp |
03: 53 push ebx | push ebx
04: 57 push edi |
05: 56 push esi |
06: 83 E4 F8 and esp, 0FFFFFFF8h |
09: 83 EC 10 sub esp, 10h | sub esp, 8
0C: 8B 45 14 mov eax, [ebp+14h] | mov eax, [esp+28]
0F: 8B 55 10 mov edx, [ebp+10h] | mov ecx, [esp+24]
12: 8B 35 00 00 00 00 mov esi, [___security_cookie] |
18: 89 E7 mov edi, esp | push esp
1A: 89 C1 mov ecx, eax |
1C: C1 F9 1F sar ecx, 1Fh | cdq
1F: 01 CA add edx, ecx | xor ecx, edx
21: 11 C8 adc eax, ecx | xor eax, edx
23: 31 CA xor edx, ecx | sub ecx, edx
25: 31 EE xor esi, ebp |
27: 31 C8 xor eax, ecx | sbb eax, edx
                                                         > push eax
                                                         > push ecx
29: 8B 4D 0C mov ecx, [ebp+0Ch] | mov eax, [esp+32]
2C: 89 74 24 08 mov [esp+8],esi |
30: 8B 75 08 mov esi, [ebp+8] | mov ecx, [esp+28]
33: 89 CB mov ebx, ecx | cdq
35: C1 FB 1F sar ebx, 1Fh | mov ebx, edx
38: 31 DE xor esi, ebx | xor ecx, edx
3A: 31 D9 xor ecx, ebx | xor eax, edx
3C: 29 DE sub esi, ebx | sub ecx, edx
3E: 19 D9 sbb ecx, ebx | sbb eax, edx
40: 57 push edi |
41: 50 push eax | push eax
42: 52 push edx |
43: 51 push ecx | push ecx
44: 56 push esi |
45: E8 00 00 00 00 call ___udivmoddi4 | call ___udivmoddi4
4A: 83 C4 14 add esp, 14h | add esp, 20
4D: 8B 3C 24 mov edi, [esp] |
50: 8B 74 24 04 mov esi, [esp+4] | mov eax, [esp]
54: 8B 4C 24 08 mov ecx, [esp+8] | mov edx, [esp+4]
58: 31 DF xor edi, ebx | xor eax, ebx
5A: 31 DE xor esi, ebx | xor edx, ebx
5C: 29 DF sub edi, ebx | sub eax, ebx
5E: 19 DE sbb esi, ebx | sbb edx, ebx
60: 31 E9 xor ecx, ebp |
62: E8 00 00 00 00 call @__security_check_cookie@4|
67: 89 F8 mov eax, edi |
69: 89 F2 mov edx, esi |
6B: 8D 65 F4 lea esp, [ebp-0Ch] | add esp, 8
6E: 5E pop esi |
6F: 5F pop edi |
70: 5B pop ebx | pop ebx
71: 5D pop ebp |
72: C3 ret | ret

clang generates 51 instructions, 18 more than properly optimised code,
tinkers with a stack cookie, although there is no array allocated on
the stack, and clobbers registers EDI and ESI without necessity.

// Signed 64-bit quotient built on the unsigned helper: both operands are
// replaced by their magnitudes, __udivmoddi4() is called on those, and its
// result is negated when exactly one of the operands was negative.
//
// The sign masks rely on '>>' of a negative long long being an arithmetic
// shift (yielding -1); C leaves that implementation-defined. The shift
// count 63 also assumes long long is exactly 64 bits wide.
// NOTE(review): when an operand is the most negative long long,
// (x ^ -1) - -1 overflows the signed type -- confirm the target wraps, or
// perform the negation in unsigned arithmetic.
long long __divdi3(long long dividend, long long divisor)
{
    long long r = divisor >> 63; // r = divisor < 0 ? -1 : 0
    long long s = dividend >> 63; // s = dividend < 0 ? -1 : 0
    divisor = (divisor ^ r) - r; // negate if divisor < 0
    dividend = (dividend ^ s) - s; // negate if dividend < 0
    s ^= r; // sign of quotient: -1 if the operand signs differ, else 0
                                    // negate if quotient < 0
    // The helper returns unsigned long long, so s is converted and the
    // xor/subtract below are done in unsigned arithmetic; the result is
    // converted back to long long by the return.
    // The third argument 0 is a null 'remainder' pointer -- presumably
    // "remainder not wanted"; confirm against the helper's definition.
    return (__udivmoddi4(dividend, divisor, 0) ^ s) - s;
}

__divdi3: # @__divdi3
        push ebx | push ebx
        push edi |
        push esi |
        mov ecx, dword ptr [esp + 28] | mov eax, [esp+20]
        mov eax, dword ptr [esp + 20] |
        mov edi, dword ptr [esp + 24] | mov ecx, [esp+16]
        mov ebx, dword ptr [esp + 16] |
        mov edx, ecx |
        mov esi, eax |
        sar edx, 31 | cdq
        sar esi, 31 | mov ebx, edx
        xor edi, edx | xor ecx, edx
        xor ecx, edx | xor eax, edx
        sub edi, edx | sub ecx, edx
        sbb ecx, edx | sbb eax, edx
        xor ebx, esi |
        xor eax, esi |
        sub ebx, esi |
        sbb eax, esi |
        xor esi, edx |
        sub esp, 12 # WTF? |
        push 0 | push 0
        push ecx | push eax
        push edi | push ecx
                                           > mov eax, [esp+24]
                                           > mov ecx, [esp+20]
                                           > cdq
                                           > xor ecx, edx
                                           > xor eax, edx
                                           > sub ecx, edx
                                           > sbb eax, edx
                                           > xor ebx, edx
        push eax | push eax
        push ebx | push ecx
        call __udivmoddi4 | call __udivmoddi4
        add esp, 32 | add esp, 20
        xor eax, esi | xor eax, ebx
        xor edx, esi | xor edx, ebx
        sub eax, esi | sub eax, ebx
        sbb edx, esi | sbb edx, ebx
        pop esi |
        pop edi |
        pop ebx | pop ebx
        ret | ret

clang generates 36 instructions, 6 more than properly optimised code,
tinkers with ESP and clobbers registers EDI and ESI without necessity.

stay tuned
Stefan Kanthak