Where's the optimiser gone? (part 3)

Hi @ll,

when called with "-O3 -target i386", the compiler fails to generate
properly optimised code for two of the following (rather trivial)
functions; in the listings below, the generated code is shown on the
left side and the properly optimised code on the right side:
(see <https://godbolt.org/z/tiwJqX>)

Especially notice the difference between the signed and unsigned
variants: the latter are properly optimised!

regards
Stefan Kanthak

--- sample3.c ---

/* Signed widening multiply: x is converted to long long before the
   multiplication, so the product of x and y is computed and returned
   as a long long. */
__inline
long long Int32x32To64(long x, long y)
{
    return (long long) x * y;
}

/* Returns the quotient of the long long product a * b divided by c.
   The division is carried out in long long (c is converted), and the
   result is converted back to long on return. */
long Int32x32To64Div32(long a, long b, long c)
{
    return Int32x32To64(a, b) / c;
}

Int32x32To64Div32: # @Int32x32To64Div32
    push ebp | push ebp
    mov ebp, esp | mov ebp, esp
    push esi
    mov eax, dword ptr [ebp + 12]
    mov ecx, dword ptr [ebp + 16] | mov eax, dword ptr [ebp + 16]
    imul dword ptr [ebp + 8] | cdq
    mov esi, ecx | push edx
    sar esi, 31 | push eax
    push esi | mov eax, dword ptr [ebp + 12]
    push ecx | imul dword ptr [ebp + 8]
    push edx | push edx
    push eax | push eax
    call __divdi3 | call __divdi3
    add esp, 16 | add esp, 16
    pop esi
    pop ebp | pop ebp
    ret | ret

/* Returns the remainder of the long long product a * b divided by c.
   The remainder is computed in long long (c is converted), and the
   result is converted back to long on return. */
long Int32x32To64Rem32(long a, long b, long c)
{
    return Int32x32To64(a, b) % c;
}

Int32x32To64Rem32: # @Int32x32To64Rem32
    push ebp | push ebp
    mov ebp, esp | mov ebp, esp
    push esi
    mov eax, dword ptr [ebp + 12]
    mov ecx, dword ptr [ebp + 16] | mov eax, dword ptr [ebp + 16]
    imul dword ptr [ebp + 8] | cdq
    mov esi, ecx | push edx
    sar esi, 31 | push eax
    push esi | mov eax, dword ptr [ebp + 12]
    push ecx | imul dword ptr [ebp + 8]
    push edx | push edx
    push eax | push eax
    call __moddi3 | call __moddi3
    add esp, 16 | add esp, 16
    pop esi
    pop ebp | pop ebp
    ret | ret

/* Unsigned widening multiply: x is converted to unsigned long long
   before the multiplication, so the product of x and y is computed and
   returned as an unsigned long long.
   Note: unlike the signed Int32x32To64, this helper is declared
   __fastcall. */
__inline
unsigned long long __fastcall UInt32x32To64(unsigned long x, unsigned long y)
{
    return (unsigned long long) x * y;
}

/* Returns the quotient of the unsigned long long product a * b divided
   by c. The division is carried out in unsigned long long (c is
   converted), and the result is converted back to unsigned long on
   return. */
unsigned long UInt32x32To64Div32(unsigned long a, unsigned long b, unsigned long c)
{
    return UInt32x32To64(a, b) / c;
}

UInt32x32To64Div32: # @UInt32x32To64Div32
    push ebp
    mov ebp, esp
    mov eax, dword ptr [ebp + 12]
    mul dword ptr [ebp + 8]
    push 0
    push dword ptr [ebp + 16]
    push edx
    push eax
    call __udivdi3
    add esp, 16
    pop ebp
    ret

/* Returns the remainder of the unsigned long long product a * b divided
   by c. The remainder is computed in unsigned long long (c is
   converted), and the result is converted back to unsigned long on
   return. */
unsigned long UInt32x32To64Rem32(unsigned long a, unsigned long b, unsigned long c)
{
    return UInt32x32To64(a, b) % c;
}

UInt32x32To64Rem32: # @UInt32x32To64Rem32
    push ebp
    mov ebp, esp
    mov eax, dword ptr [ebp + 12]
    mul dword ptr [ebp + 8]
    push 0
    push dword ptr [ebp + 16]
    push edx
    push eax
    call __umoddi3
    add esp, 16
    pop ebp
    ret