LLVM optimization

The following C test program was compiled using LLVM with -O3 option and MSVC with /O2.
The MSVC one is about 600 times faster than the one compiled with the LLVM.
We can see that the for loop in MSVC assembler is solved in the optimization pass more efficiently than that in LLVM.
Is there a way to get an optimization result in LLVM like that of MSVC?
Manoel Teixeira

#include <windows.h>
#include <stdio.h>

/*
 * Benchmark kernel used to compare how compilers optimize a counting loop.
 *
 * varx starts at parami when parami is positive and at 0 otherwise; the loop
 * then adds (parami + 1) to it once for every nI in 1..paraml. vary is
 * written but never read, and paramd is unused.
 *
 * Returns the final value of varx.
 */
int TESTE(int parami, int paraml, double paramd)
{
    int varx = (parami > 0) ? parami : 0;
    int vary = (parami > 0) ? 0 : paraml;
    int nI = 1;

    while (nI <= paraml) {
        varx = varx + parami + 1;
        vary = varx + nI;   /* dead store: vary is never read */
        nI++;
    }

    return varx;
}
/*
 * Thread entry point for the benchmark.
 *
 * c is the thread argument; it is converted back to a pointer to a function
 * taking (int, int, double) and returning int. That function is called with
 * (1, 1000000000, 1), the result is printed together with the current thread
 * id, and the result is returned.
 *
 * NOTE(review): main() casts this function to LPTHREAD_START_ROUTINE, whose
 * documented shape is DWORD WINAPI (LPVOID). This definition carries no
 * WINAPI, so confirm the calling conventions agree on the target.
 */
unsigned long thread_call( LPVOID c )
{
  int num = 1;
  int (*fp)(int, int, double) = (int (*)(int, int,double)) c;
  int ret = fp(num, 1000000000, 1);

  /* Every conversion below is %ld, so each argument is passed as a long. */
  printf("\n(2)leu %ld threadid = %ld seqt=%ld ",
         (long) ret, (long) GetCurrentThreadId(), (long) num);
  return (unsigned long) ret;
}
/* ---- stopwatch ---- */

/* Start and end readings of the timed interval, in milliseconds. */
unsigned long tini;
unsigned long tfim;

/* Converts a raw reading difference to milliseconds (currently the identity). */
#define getmilisecs(x) (x)
/* Thread count used by main(). */
#define num_th 100

/* Elapsed time between tini and tfim, in milliseconds. */
unsigned long milisecs(void) { return getmilisecs(tfim - tini); }

/* Elapsed time between tini and tfim, in whole seconds. */
unsigned long secs(void) { return milisecs() / 1000; }

/*
 * Formats the elapsed time as "HH:MM:SS:mmm".
 *
 * Returns a pointer to a static buffer, so the text is overwritten by the
 * next call.
 */
const char *spenttime(void)
{
  static char buffer[64];
  unsigned long systime = secs();
  unsigned long milisectime = milisecs() % 1000;

  /* The values are unsigned long, so the conversions must be %lu, not %d. */
  sprintf(buffer, "%02lu:%02lu:%02lu:%03lu",
          systime / 3600, (systime % 3600) / 60, (systime % 3600) % 60,
          milisectime);
  return buffer;
}
/* ---- end of stopwatch ---- */
/*
 * Benchmark driver: starts num_th threads that each run thread_call with
 * TESTE as the argument, waits for all of them, and prints the elapsed
 * time measured with GetTickCount().
 *
 * The command-line arguments are not used. Always returns 0.
 */
int main(int a, char **b)
{
  HANDLE mainThread[num_th];
  DWORD iThreadId;
  int created = 0;
  int i;

  (void) a;
  (void) b;

  tfim = 0;
  tini = GetTickCount();

  for (i = 0; i < num_th; i++)
  {
    HANDLE h = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread_call,
                            (LPVOID)TESTE, 0, &iThreadId);
    if (h == NULL)
    {
      /* Keep only valid handles so the wait loop never sees NULL. */
      fprintf(stderr, "\nCreateThread failed (error %lu)",
              (unsigned long) GetLastError());
      continue;
    }
    mainThread[created++] = h;
  }

  /*
   * Wait handle by handle: WaitForMultipleObjects accepts at most
   * MAXIMUM_WAIT_OBJECTS handles per call, which is fewer than num_th.
   */
  for (i = 0; i < created; i++)
  {
    WaitForSingleObject(mainThread[i], INFINITE);
    CloseHandle(mainThread[i]);
  }
  tfim = GetTickCount();

  printf("\n chamou = %s", spenttime());
  return 0;
}
//////////////////////////

; ModuleID = 'testeadvpl.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-mingw32"
  %struct._SECURITY_ATTRIBUTES = type { i32, i8*, i32 }
@tfim = common global i32 0 ; <i32*> [#uses=5]
@tini = common global i32 0 ; <i32*> [#uses=5]
@.str = internal constant [38 x i8] c"\0A(2)leu %ld threadid = %ld seqt=%ld \00" ; <[38 x i8]*> [#uses=1]
@buffer.30732 = internal global [64 x i8] zeroinitializer, align 32 ; <[64 x i8]*> [#uses=1]
@.str1 = internal constant [20 x i8] c"%02d:%02d:%02d:%03d\00" ; <[20 x i8]*> [#uses=1]
@.str2 = internal constant [14 x i8] c"\0A chamou = %s\00" ; <[14 x i8]*> [#uses=1]

; TESTE as emitted by LLVM. The counting loop is still present: bb3 executes
; once per iteration and only the value from its last trip reaches bb5.
define i32 @TESTE(i32 %parami, i32 %paraml, double %paramd) nounwind readnone {
entry:
  ; varx.0 = (parami > 0) ? parami : 0 -- the result when the loop is skipped.
  %0 = icmp sgt i32 %parami, 0 ; <i1> [#uses=1]
  %varx.0 = select i1 %0, i32 %parami, i32 0 ; <i32> [#uses=1]
  ; Bypass the loop when paraml < 1.
  %1 = icmp slt i32 %paraml, 1 ; <i1> [#uses=1]
  br i1 %1, label %bb5, label %bb.nph

bb.nph: ; preds = %entry
  ; Per-iteration step, parami + 1, computed once before the loop.
  %2 = add i32 %parami, 1 ; <i32> [#uses=2]
  br label %bb3

bb3: ; preds = %bb3, %bb.nph
  ; indvar counts completed iterations, starting at 0.
  %indvar = phi i32 [ 0, %bb.nph ], [ %indvar.next, %bb3 ] ; <i32> [#uses=3]
  ; smax = (parami < 0) ? 0 : parami, recomputed inside the loop.
  %tmp = icmp slt i32 %parami, 0 ; <i1> [#uses=1]
  %smax = select i1 %tmp, i32 0, i32 %parami ; <i32> [#uses=1]
  ; %3 = smax + (indvar + 1) * (parami + 1): varx after this iteration,
  ; derived from indvar instead of being carried in its own phi.
  %tmp11 = mul i32 %indvar, %2 ; <i32> [#uses=1]
  %varx.18 = add i32 %tmp11, %smax ; <i32> [#uses=1]
  %3 = add i32 %2, %varx.18 ; <i32> [#uses=1]
  ; Leave the loop once indvar + 2 > paraml.
  %4 = add i32 %indvar, 2 ; <i32> [#uses=1]
  %5 = icmp sgt i32 %4, %paraml ; <i1> [#uses=1]
  %indvar.next = add i32 %indvar, 1 ; <i32> [#uses=1]
  br i1 %5, label %bb5, label %bb3

bb5: ; preds = %bb3, %entry
  ; Result: varx.0 when the loop was skipped, otherwise %3 from the last trip.
  %varx.1.lcssa = phi i32 [ %varx.0, %entry ], [ %3, %bb3 ] ; <i32> [#uses=1]
  ret i32 %varx.1.lcssa
}

define i32 @milisecs() nounwind readonly {
entry:
  %0 = load i32* @tfim, align 4 ; <i32> [#uses=1]
  %1 = load i32* @tini, align 4 ; <i32> [#uses=1]
  %2 = sub i32 %0, %1 ; <i32> [#uses=1]
  ret i32 %2
}

define i32 @thread_call(i8* %c) nounwind {
entry:
  %0 = bitcast i8* %c to i32 (i32, i32, double)* ; <i32 (i32, i32, double)*> [#uses=1]
  %1 = tail call i32 %0(i32 1, i32 1000000000, double 1.000000e+000) nounwind ; <i32> [#uses=2]
  %2 = tail call x86_stdcallcc i32 @GetCurrentThreadId() nounwind ; <i32> [#uses=1]
  %3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([38 x i8]* @.str, i32 0, i32 0), i32 %1, i32 %2, i32 1) nounwind ; <i32> [#uses=0]
  ret i32 %1
}

declare x86_stdcallcc i32 @GetCurrentThreadId()

declare i32 @printf(i8*, ...) nounwind

define i32 @secs() nounwind readonly {
entry:
  %0 = load i32* @tfim, align 4 ; <i32> [#uses=1]
  %1 = load i32* @tini, align 4 ; <i32> [#uses=1]
  %2 = sub i32 %0, %1 ; <i32> [#uses=1]
  %3 = udiv i32 %2, 1000 ; <i32> [#uses=1]
  ret i32 %3
}

define i8* @spenttime() nounwind {
entry:
  %0 = load i32* @tfim, align 4 ; <i32> [#uses=1]
  %1 = load i32* @tini, align 4 ; <i32> [#uses=1]
  %2 = sub i32 %0, %1 ; <i32> [#uses=3]
  %3 = udiv i32 %2, 1000 ; <i32> [#uses=1]
  %4 = urem i32 %2, 1000 ; <i32> [#uses=1]
  %5 = urem i32 %3, 3600 ; <i32> [#uses=2]
  %6 = urem i32 %5, 60 ; <i32> [#uses=1]
  %7 = udiv i32 %5, 60 ; <i32> [#uses=1]
  %8 = udiv i32 %2, 3600000 ; <i32> [#uses=1]
  %9 = tail call i32 (i8*, i8*, ...)* @sprintf(i8* getelementptr ([64 x i8]* @buffer.30732, i32 0, i32 0), i8* getelementptr ([20 x i8]* @.str1, i32 0, i32 0), i32 %8, i32 %7, i32 %6, i32 %4) nounwind ; <i32> [#uses=0]
  ret i8* getelementptr ([64 x i8]* @buffer.30732, i32 0, i32 0)
}

declare i32 @sprintf(i8*, i8*, ...) nounwind

define i32 @main(i32 %a, i8** %b) nounwind {
entry:
  %mainThread = alloca [100 x i8*] ; <[100 x i8*]*> [#uses=2]
  %iThreadId = alloca i32 ; <i32*> [#uses=1]
  store i32 0, i32* @tfim, align 4
  %0 = call x86_stdcallcc i32 @GetTickCount() nounwind ; <i32> [#uses=1]
  store i32 %0, i32* @tini, align 4
  br label %bb

bb: ; preds = %bb, %entry
  %i.0.reg2mem.0 = phi i32 [ 0, %entry ], [ %indvar.next14, %bb ] ; <i32> [#uses=2]
  %1 = call x86_stdcallcc i8* @CreateThread(%struct._SECURITY_ATTRIBUTES* null, i32 0, i32 (i8*)* @thread_call, i8* bitcast (i32 (i32, i32, double)* @TESTE to i8*), i32 0, i32* %iThreadId) nounwind ; <i8*> [#uses=1]
  %2 = getelementptr [100 x i8*]* %mainThread, i32 0, i32 %i.0.reg2mem.0 ; <i8**> [#uses=1]
  store i8* %1, i8** %2, align 4
  %indvar.next14 = add i32 %i.0.reg2mem.0, 1 ; <i32> [#uses=2]
  %exitcond15 = icmp eq i32 %indvar.next14, 100 ; <i1> [#uses=1]
  br i1 %exitcond15, label %bb3, label %bb

bb3: ; preds = %bb3, %bb
  %i.1.reg2mem.0 = phi i32 [ 0, %bb ], [ %indvar.next, %bb3 ] ; <i32> [#uses=2]
  %3 = getelementptr [100 x i8*]* %mainThread, i32 0, i32 %i.1.reg2mem.0 ; <i8**> [#uses=2]
  %4 = load i8** %3, align 4 ; <i8*> [#uses=1]
  %5 = call x86_stdcallcc i32 @WaitForSingleObject(i8* %4, i32 -1) nounwind ; <i32> [#uses=0]
  %6 = load i8** %3, align 4 ; <i8*> [#uses=1]
  %7 = call x86_stdcallcc i32 @CloseHandle(i8* %6) nounwind ; <i32> [#uses=0]
  %indvar.next = add i32 %i.1.reg2mem.0, 1 ; <i32> [#uses=2]
  %exitcond = icmp eq i32 %indvar.next, 100 ; <i1> [#uses=1]
  br i1 %exitcond, label %bb5, label %bb3

bb5: ; preds = %bb3
  %8 = call x86_stdcallcc i32 @GetTickCount() nounwind ; <i32> [#uses=2]
  store i32 %8, i32* @tfim, align 4
  %9 = load i32* @tini, align 4 ; <i32> [#uses=1]
  %10 = sub i32 %8, %9 ; <i32> [#uses=3]
  %11 = udiv i32 %10, 1000 ; <i32> [#uses=1]
  %12 = urem i32 %10, 1000 ; <i32> [#uses=1]
  %13 = urem i32 %11, 3600 ; <i32> [#uses=2]
  %14 = urem i32 %13, 60 ; <i32> [#uses=1]
  %15 = udiv i32 %13, 60 ; <i32> [#uses=1]
  %16 = udiv i32 %10, 3600000 ; <i32> [#uses=1]
  %17 = call i32 (i8*, i8*, ...)* @sprintf(i8* getelementptr ([64 x i8]* @buffer.30732, i32 0, i32 0), i8* getelementptr ([20 x i8]* @.str1, i32 0, i32 0), i32 %16, i32 %15, i32 %14, i32 %12) nounwind ; <i32> [#uses=0]
  %18 = call i32 (i8*, ...)* @printf(i8* getelementptr ([14 x i8]* @.str2, i32 0, i32 0), i8* getelementptr ([64 x i8]* @buffer.30732, i32 0, i32 0)) nounwind ; <i32> [#uses=0]
  ret i32 0
}

declare x86_stdcallcc i32 @GetTickCount()

declare x86_stdcallcc i8* @CreateThread(%struct._SECURITY_ATTRIBUTES*, i32, i32 (i8*)*, i8*, i32, i32*)

declare x86_stdcallcc i32 @WaitForSingleObject(i8*, i32)

declare x86_stdcallcc i32 @CloseHandle(i8*)
////////////////////////
; Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.50727.762

  TITLE C:\msys\1.0\home\mteixeira\testeadvpl.c
  .686P
  .XMM
  include listing.inc
  .model flat

INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES

_DATA SEGMENT
COMM _tini:DWORD
COMM _tfim:DWORD
_DATA ENDS
PUBLIC _TESTE
; Function compile flags: /Ogtpy
; File c:\msys\1.0\home\mteixeira\testeadvpl.c
; COMDAT _TESTE
_TEXT SEGMENT
_parami$ = 8 ; size = 4
_paraml$ = 12 ; size = 4
_paramd$ = 16 ; size = 8
; TESTE as emitted by MSVC. There is no loop: the result is computed directly
; as the initial varx plus (parami + 1) * paraml.
_TESTE PROC ; COMDAT

; 6 : int varx=0,vary=0;
; 7 : int nI =0;
; 8 : //varx= parami;
; 9 : if( parami > 0 )

  ; ecx = parami
  mov ecx, DWORD PTR _parami$[esp-4]

; 10 : {
; 11 : varx = parami;
; 12 : vary = 0;
; 13 : }
; 14 : else
; 15 : {
; 16 : varx = 0;
; 17 : vary = paraml;
; 18 : }
; 19 : for( nI = 1 ; nI <= paraml; nI++)

  ; edx = paraml
  mov edx, DWORD PTR _paraml$[esp-4]
  ; Branch-free select: eax = (parami <= 0) ? 1 : 0, minus 1 gives 0 or
  ; all-ones, and masking with parami leaves (parami > 0) ? parami : 0.
  xor eax, eax
  test ecx, ecx
  setle al
  sub eax, 1
  and eax, ecx
  ; Skip the update when paraml < 1 (the source loop would not run).
  cmp edx, 1
  jl SHORT $LN3@TESTE
  ; eax += (parami + 1) * paraml -- the whole loop folded into one multiply.
  add ecx, 1
  imul ecx, edx
  add eax, ecx
$LN3@TESTE:

; 20 : {
; 21 : varx = varx + parami + 1 ;
; 22 : vary = varx + nI;
; 23 : }
; 24 :
; 25 : return varx ;
; 26 : }

  ; Return value is in eax.
  ret 0
_TESTE ENDP
_TEXT ENDS
PUBLIC ??_C@_0CG@LBAPCNHJ@?6?$CI2?$CJleu?5?$CFld?5threadid?5?$DN?5?5?$CFld?5seqt@ ; `string'
PUBLIC __real@3ff0000000000000
PUBLIC _thread_call
EXTRN _printf:PROC
EXTRN __imp__GetCurrentThreadId@0:PROC
EXTRN __fltused:DWORD
; COMDAT ??_C@_0CG@LBAPCNHJ@?6?$CI2?$CJleu?5?$CFld?5threadid?5?$DN?5?5?$CFld?5seqt@
CONST SEGMENT
??_C@_0CG@LBAPCNHJ@?6?$CI2?$CJleu?5?$CFld?5threadid?5?$DN?5?5?$CFld?5seqt@ DB 0aH
  DB '(2)leu %ld threadid = %ld seqt=%ld ', 00H ; `string'
CONST ENDS
; COMDAT __real@3ff0000000000000
CONST SEGMENT
__real@3ff0000000000000 DQ 03ff0000000000000r ; 1
; Function compile flags: /Ogtpy
CONST ENDS
; COMDAT _thread_call
_TEXT SEGMENT
_c$ = 8 ; size = 4
_thread_call PROC ; COMDAT

; 29 : int num = 1;
; 30 : int (*fp)(int, int, double) = (int (*)(int, int,double)) c;
; 31 : //printf("\n(1)threadid = %ld seqt=%ld inum=%d",GetCurrentThreadId(),num,inum);
; 32 : int ret = fp(num,1000000000,1);

  fld1
  push esi
  sub esp, 8
  fstp QWORD PTR [esp]
  push 1000000000 ; 3b9aca00H
  push 1
  call DWORD PTR _c$[esp+16]
  add esp, 16 ; 00000010H

; 33 : printf("\n(2)leu %ld threadid = %ld seqt=%ld ",ret , GetCurrentThreadId(),num);

  push 1
  mov esi, eax
  call DWORD PTR __imp__GetCurrentThreadId@0
  push eax
  push esi
  push OFFSET ??_C@_0CG@LBAPCNHJ@?6?$CI2?$CJleu?5?$CFld?5threadid?5?$DN?5?5?$CFld?5seqt@
  call _printf
  add esp, 16 ; 00000010H

; 34 : return (unsigned long) ret;

  mov eax, esi
  pop esi

; 35 : }

  ret 0
_thread_call ENDP
_TEXT ENDS
PUBLIC _milisecs
; Function compile flags: /Ogtpy
; COMDAT _milisecs
_TEXT SEGMENT
_milisecs PROC ; COMDAT

; 41 : unsigned long milisecs() { return getmilisecs(tfim-tini);};

  mov eax, DWORD PTR _tfim
  sub eax, DWORD PTR _tini
  ret 0
_milisecs ENDP
_TEXT ENDS
PUBLIC _secs
; Function compile flags: /Ogtpy
; COMDAT _secs
_TEXT SEGMENT
_secs PROC ; COMDAT

; 42 : unsigned long secs() { return milisecs()/1000;};

  mov ecx, DWORD PTR _tfim
  sub ecx, DWORD PTR _tini
  mov eax, 274877907 ; 10624dd3H
  mul ecx
  shr edx, 6
  mov eax, edx
  ret 0
_secs ENDP
_TEXT ENDS
PUBLIC ??_C@_0BE@FFMOMMDD@?$CF02d?3?$CF02d?3?$CF02d?3?$CF03d?$AA@ ; `string'
PUBLIC _spenttime
EXTRN _sprintf:PROC
_BSS SEGMENT
?buffer@?1??spenttime@@9@9 DB 040H DUP (?) ; `spenttime'::`2'::buffer
_BSS ENDS
; COMDAT ??_C@_0BE@FFMOMMDD@?$CF02d?3?$CF02d?3?$CF02d?3?$CF03d?$AA@
CONST SEGMENT
??_C@_0BE@FFMOMMDD@?$CF02d?3?$CF02d?3?$CF02d?3?$CF03d?$AA@ DB '%02d:%02d:'
  DB '%02d:%03d', 00H ; `string'
; Function compile flags: /Ogtpy
CONST ENDS
; COMDAT _spenttime
_TEXT SEGMENT
_spenttime PROC ; COMDAT

; 45 : static char buffer[64];
; 46 : unsigned long systime = secs();

  mov eax, DWORD PTR _tfim
  sub eax, DWORD PTR _tini
  xor edx, edx
  mov ecx, 1000 ; 000003e8H
  div ecx
  push esi
  push edi
  mov esi, eax
  mov edi, edx

; 47 : unsigned long milisectime = milisecs()%1000;
; 48 : sprintf(buffer,"%02d:%02d:%02d:%03d",systime/3600,(systime%3600)/60,(systime%3600)%60,milisectime);

  mov eax, -1851608123 ; 91a2b3c5H
  mul esi
  mov ecx, edx
  shr ecx, 11 ; 0000000bH
  mov edx, ecx
  imul edx, 3600 ; 00000e10H
  mov eax, esi
  sub eax, edx
  xor edx, edx
  mov esi, 60 ; 0000003cH
  div esi
  push edi
  push edx
  push eax
  push ecx
  push OFFSET ??_C@_0BE@FFMOMMDD@?$CF02d?3?$CF02d?3?$CF02d?3?$CF03d?$AA@
  push OFFSET ?buffer@?1??spenttime@@9@9
  call _sprintf
  add esp, 24 ; 00000018H
  pop edi

; 49 : return (const char*) buffer;

  mov eax, OFFSET ?buffer@?1??spenttime@@9@9
  pop esi

; 50 : };

  ret 0
_spenttime ENDP
_TEXT ENDS
PUBLIC ??_C@_0O@BKPPOCPE@?6?5chamou?5?$DN?5?$CFs?$AA@ ; `string'
PUBLIC _main
EXTRN __imp__CloseHandle@4:PROC
EXTRN __imp__WaitForSingleObject@8:PROC
EXTRN __imp__CreateThread@24:PROC
EXTRN __imp__GetTickCount@0:PROC
; COMDAT ??_C@_0O@BKPPOCPE@?6?5chamou?5?$DN?5?$CFs?$AA@
CONST SEGMENT
??_C@_0O@BKPPOCPE@?6?5chamou?5?$DN?5?$CFs?$AA@ DB 0aH, ' chamou = %s', 00H ; `string'
; Function compile flags: /Ogtpy
CONST ENDS
; COMDAT _main
_TEXT SEGMENT
_iThreadId$ = -404 ; size = 4
_mainThread$ = -400 ; size = 400
_a$ = 8 ; size = 4
_b$ = 12 ; size = 4
_main PROC ; COMDAT

; 53 : {

  sub esp, 404 ; 00000194H
  push ebx
  push ebp
  push esi
  push edi

; 54 : int i;
; 55 : DWORD iThreadId;
; 56 : HANDLE mainThread[num_th];
; 57 : tfim = 0;

  mov DWORD PTR _tfim, 0

; 58 : tini = GetTickCount();

  call DWORD PTR __imp__GetTickCount@0

; 59 : for(i=0; i< num_th;i++)

  mov edi, DWORD PTR __imp__CreateThread@24
  mov DWORD PTR _tini, eax
  xor esi, esi
$LL6@main:

; 60 : mainThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread_call, (LPVOID)TESTE, 0, (DWORD *)&iThreadId);

  lea eax, DWORD PTR _iThreadId$[esp+420]
  push eax
  push 0
  push OFFSET _TESTE
  push OFFSET _thread_call
  push 0
  push 0
  call edi
  mov DWORD PTR _mainThread$[esp+esi*4+420], eax
  add esi, 1
  cmp esi, 100 ; 00000064H
  jl SHORT $LL6@main

; 61 :
; 62 : //WaitForMultipleObjects( num_th, (const HANDLE* )mainThread, TRUE, INFINITE);
; 63 : for( i=0; i < num_th; i++)

  mov ebx, DWORD PTR __imp__WaitForSingleObject@8
  mov ebp, DWORD PTR __imp__CloseHandle@4
  xor esi, esi
$LL3@main:

; 64 : {
; 65 : WaitForSingleObject( mainThread[i], INFINITE );

  mov edi, DWORD PTR _mainThread$[esp+esi*4+420]
  push -1
  push edi
  call ebx

; 66 : CloseHandle(mainThread[i]);

  push edi
  call ebp
  add esi, 1
  cmp esi, 100 ; 00000064H
  jl SHORT $LL3@main

; 67 : }
; 68 : tfim = GetTickCount();

  call DWORD PTR __imp__GetTickCount@0
  mov DWORD PTR _tfim, eax

; 69 :
; 70 : printf("\n chamou = %s",spenttime () );

  sub eax, DWORD PTR _tini
  xor edx, edx
  mov ecx, 1000 ; 000003e8H
  div ecx
  mov esi, eax
  mov edi, edx
  mov eax, -1851608123 ; 91a2b3c5H
  mul esi
  mov ecx, edx
  shr ecx, 11 ; 0000000bH
  mov edx, ecx
  imul edx, 3600 ; 00000e10H
  mov eax, esi
  sub eax, edx
  xor edx, edx
  mov esi, 60 ; 0000003cH
  div esi
  push edi
  push edx
  push eax
  push ecx
  push OFFSET ??_C@_0BE@FFMOMMDD@?$CF02d?3?$CF02d?3?$CF02d?3?$CF03d?$AA@
  push OFFSET ?buffer@?1??spenttime@@9@9
  call _sprintf
  push OFFSET ?buffer@?1??spenttime@@9@9
  push OFFSET ??_C@_0O@BKPPOCPE@?6?5chamou?5?$DN?5?$CFs?$AA@
  call _printf
  add esp, 32 ; 00000020H
  pop edi
  pop esi
  pop ebp

; 71 : return 0;

  xor eax, eax
  pop ebx

; 72 : }

  add esp, 404 ; 00000194H
  ret 0
_main ENDP
_TEXT ENDS
END

The following C test program was compiled using LLVM with -O3 option and MSVC with /O2.
The MSVC one is about 600 times faster than the one compiled with the LLVM.
We can see that the for loop in MSVC assembler is solved in the optimization pass more efficiently than that in LLVM.
Is there a way to get an optimization result in LLVM like that of MSVC?
Manoel Teixeira

#include <windows.h>
#include <stdio.h>

/*
 * Benchmark kernel used to compare how compilers optimize a counting loop.
 *
 * varx starts at parami when parami is positive and at 0 otherwise; the loop
 * then adds (parami + 1) to it once for every nI in 1..paraml. vary is
 * written but never read, and paramd is unused.
 *
 * Returns the final value of varx.
 */
int TESTE(int parami, int paraml, double paramd)
{
    int varx = (parami > 0) ? parami : 0;
    int vary = (parami > 0) ? 0 : paraml;
    int nI = 1;

    while (nI <= paraml) {
        varx = varx + parami + 1;
        vary = varx + nI;   /* dead store: vary is never read */
        nI++;
    }

    return varx;
}
/*
 * Thread entry point for the benchmark.
 *
 * c is the thread argument; it is converted back to a pointer to a function
 * taking (int, int, double) and returning int. That function is called with
 * (1, 1000000000, 1), the result is printed together with the current thread
 * id, and the result is returned.
 *
 * NOTE(review): the caller casts this function to LPTHREAD_START_ROUTINE,
 * whose documented shape is DWORD WINAPI (LPVOID). This definition carries no
 * WINAPI, so confirm the calling conventions agree on the target.
 */
unsigned long thread_call( LPVOID c )
{
       int num = 1;
       int (*fp)(int, int, double) = (int (*)(int, int,double)) c;
       int ret = fp(num, 1000000000, 1);

       /* Every conversion below is %ld, so each argument is passed as a long. */
       printf("\n(2)leu %ld threadid = %ld seqt=%ld ",
              (long) ret, (long) GetCurrentThreadId(), (long) num);
       return (unsigned long) ret;
}
/* ---- stopwatch ---- */

/* Start and end readings of the timed interval, in milliseconds. */
unsigned long tini;
unsigned long tfim;

/* Converts a raw reading difference to milliseconds (currently the identity). */
#define getmilisecs(x) (x)
/* Thread count used by main(). */
#define num_th 100

/* Elapsed time between tini and tfim, in milliseconds. */
unsigned long milisecs(void) { return getmilisecs(tfim - tini); }

/* Elapsed time between tini and tfim, in whole seconds. */
unsigned long secs(void) { return milisecs() / 1000; }

/*
 * Formats the elapsed time as "HH:MM:SS:mmm".
 *
 * Returns a pointer to a static buffer, so the text is overwritten by the
 * next call.
 */
const char *spenttime(void)
{
       static char buffer[64];
       unsigned long systime = secs();
       unsigned long milisectime = milisecs() % 1000;

       /* The values are unsigned long, so the conversions must be %lu, not %d. */
       sprintf(buffer, "%02lu:%02lu:%02lu:%03lu",
               systime / 3600, (systime % 3600) / 60, (systime % 3600) % 60,
               milisectime);
       return buffer;
}
/* ---- end of stopwatch ---- */
int main(int a, char **b)
{
int i;
DWORD iThreadId;
HANDLE mainThread[num_th];
tfim = 0;
tini = GetTickCount();

You're starting your count ...

for(i=0; i< num_th;i++)
               mainThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)thread_call, (LPVOID)TESTE, 0, (DWORD *)&iThreadId);

While doing a thread create method (which I don't have on my darwin
box) that calls "printf". "printf" is I/O bound and makes for a lousy
performance test.

Nevertheless, I've attached the .s file generated by LLVM from the IR
you gave. I can't see anything obviously wrong with it. Please point
out in it which parts are 600x slower than on Windows. I'm not able to
run it because I don't have a Windows box, and it requires some
Windows calls.

Note: The ASM is for a Darwin box. I didn't generate this code with
frame pointers disabled. This would improve performance, but you
didn't mention that you did the same for your Windows compile.

-bw

y.s (3.89 KB)

Hi Manoel,

The following C test program was compiled using LLVM with -O3 option and MSVC with /O2.
The MSVC one is about 600 times faster than the one compiled with the LLVM.
We can see that the for loop in MSVC assembler is solved in the optimization pass more efficiently than that in LLVM.
Is there a way to get an optimization result in LLVM like that of MSVC?

Can you please provide a testcase that doesn't need Windows?
I don't think many people around here have Windows...

Ciao,

Duncan.

Hi Manoel,

Hi, Duncan.

Here is an example:
#include <stdio.h>
#include <stdlib.h>
//
/*
 * Portable version of the benchmark kernel.
 *
 * varx starts at parami when parami is positive and at 1 otherwise; the loop
 * then adds (parami + 1) to it once for every nI in 1..paraml.
 *
 * Returns the final value of varx.
 */
int TESTE(int parami, int paraml)
{
    int varx = (parami > 0) ? parami : 1;
    int nI = 1;

    while (nI <= paraml) {
        varx = varx + parami + 1;
        nI++;
    }

    return varx;
}

/*
 * Runs TESTE on the first two command-line arguments, converted with atoi,
 * and returns its result as the exit status. Returns 0 when fewer than two
 * arguments are supplied.
 */
int main(int argc, char **argv)
{
    int parami;
    int paraml;

    if (argc >= 3) {
        parami = atoi(argv[1]);
        paraml = atoi(argv[2]);
        return TESTE(parami, paraml);
    }

    return 0;
}

I don't know how gcc4 resolves the for loop, but the MSVC is great in this example.

gcc-4.3 also eliminates the loop entirely and directly calculates the value of varx.

If I replace paraml with a constant, LLVM optimizes the loop very well.

It looks like LLVM's scalar evolution code isn't handling your testcase well.

Ciao,

Duncan.