No loop optimisation?

I saw a question on “why does clang do this” on Stack Overflow, and boiling it down to simple code, the following is the code:

size_t ret;

for(int i = 0; i < LOOP_COUNT; i++)

ret = sizeof(“abcd”);

Which I would expect to be optimsied into:

size_t ret = 5;

and the loop disappear.

Well, the assignment of ret is indeed removed, but the loop remains (in LLVM-IR).

If I rewrite it to:

size_t ret = sizeof(“abcd”);

for(int i = 0; i < LOOP_COUNT; i++)
;

then the empty loop is removed.

I can’t see a logical reason to keep “now empty” loops. Does anyone have a better idea than “it’s a bug”?

The original code was comparing:

for(int i = … )

ret = strlen(“abcd”);

and for this the loop DOES get optimised out, as does the sizeof loop on gcc.

Works fine here. How exactly did you compile it and which version of
clang are you using?

Joerg

I was using a pre-3.8 but not latest. Will pull latest and check again.

Also, if you look at IR make sure to run the appropiate passes first.
Clang output by default has very few optimisations in the -emit-llvm
output.

Joerg

That's intentional. Try -O2 and your IR will look much nicer. :slight_smile:

--renato

Just to be clear: I was using -O2, and the “first loop” which has strlen in it was optimised to a constant value and no loop, the second loop, with sizeof, was not “removed”.

Actual source below. Now with:

$ clang++ --version
clang version 3.8.0 (http://llvm.org/git/clang.git 27b8a29273862fa1e9e296be8f9850a1459115c8) (http://llvm.org/git/llvm.git 91950eea5582709ea263cc48bd97fdea817b53d1)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin

#include
#include

#define LOOP_COUNT 1000000000

unsigned long long rdtscl(void)
{
unsigned int lo, hi;
asm volatile (“rdtsc” : “=a”(lo), “=d”(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

int main()
{
unsigned long long before = rdtscl();
size_t ret;
for (int i = 0; i < LOOP_COUNT; i++)
ret = strlen(“abcd”);
unsigned long long after = rdtscl();
printf(“Strlen %lld ret=%zd\n”,(after - before), ret);

before = rdtscl();
for (int i = 0; i < LOOP_COUNT; i++)
ret = sizeof(“abcd”);
after = rdtscl();
printf(“Strlen %lld ret=%zd\n”,(after - before), ret);
}

llvm generated:

; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
%0 = tail call { i32, i32 } asm sideeffect “rdtsc”, “={ax},={dx},~{dirflag},~{fpsr},~{flags}”() #2, !srcloc !1
%asmresult1.i = extractvalue { i32, i32 } %0, 1
%conv2.i = zext i32 %asmresult1.i to i64
%shl.i = shl nuw i64 %conv2.i, 32
%asmresult.i = extractvalue { i32, i32 } %0, 0
%conv.i = zext i32 %asmresult.i to i64
%or.i = or i64 %shl.i, %conv.i
%1 = tail call { i32, i32 } asm sideeffect “rdtsc”, “={ax},={dx},~{dirflag},~{fpsr},~{flags}”() #2, !srcloc !1
%asmresult.i.25 = extractvalue { i32, i32 } %1, 0
%asmresult1.i.26 = extractvalue { i32, i32 } %1, 1
%conv.i.27 = zext i32 %asmresult.i.25 to i64
%conv2.i.28 = zext i32 %asmresult1.i.26 to i64
%shl.i.29 = shl nuw i64 %conv2.i.28, 32
%or.i.30 = or i64 %shl.i.29, %conv.i.27
%sub = sub i64 %or.i.30, %or.i
%call2 = tail call i32 (i8*, …) @printf(i8* nonnull getelementptr inbounds ([21 x i8], [21 x i8]* @.str, i64 0, i64 0), i64 %sub, i64 4)
%2 = tail call { i32, i32 } asm sideeffect “rdtsc”, “={ax},={dx},~{dirflag},~{fpsr},~{flags}”() #2, !srcloc !1
%asmresult1.i.32 = extractvalue { i32, i32 } %2, 1
%conv2.i.34 = zext i32 %asmresult1.i.32 to i64
%shl.i.35 = shl nuw i64 %conv2.i.34, 32
br label %for.cond.5

for.cond.5: ; preds = %for.cond.5, %entry
%i4.0 = phi i32 [ 0, %entry ], [ %inc10.18, %for.cond.5 ]
%inc10.18 = add nsw i32 %i4.0, 19
%exitcond.18 = icmp eq i32 %inc10.18, 1000000001
br i1 %exitcond.18, label %for.cond.cleanup.7, label %for.cond.5

for.cond.cleanup.7: ; preds = %for.cond.5
%asmresult.i.31 = extractvalue { i32, i32 } %2, 0
%conv.i.33 = zext i32 %asmresult.i.31 to i64
%or.i.36 = or i64 %shl.i.35, %conv.i.33
%3 = tail call { i32, i32 } asm sideeffect “rdtsc”, “={ax},={dx},~{dirflag},~{fpsr},~{flags}”() #2, !srcloc !1
%asmresult.i.37 = extractvalue { i32, i32 } %3, 0
%asmresult1.i.38 = extractvalue { i32, i32 } %3, 1
%conv.i.39 = zext i32 %asmresult.i.37 to i64
%conv2.i.40 = zext i32 %asmresult1.i.38 to i64
%shl.i.41 = shl nuw i64 %conv2.i.40, 32
%or.i.42 = or i64 %shl.i.41, %conv.i.39
%sub13 = sub i64 %or.i.42, %or.i.36
%call14 = tail call i32 (i8*, …) @printf(i8* nonnull getelementptr inbounds ([21 x i8], [21 x i8]* @.str, i64 0, i64 0), i64 %sub13, i64 5)
ret i32 0
}

Am I missing something?

Actual source below. Now with:

$ clang++ --version
clang version 3.8.0 (http://llvm.org/git/clang.git
27b8a29273862fa1e9e296be8f9850a1459115c8) (http://llvm.org/git/llvm.git
91950eea5582709ea263cc48bd97fdea817b53d1)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/local/bin

#include <cstdio>
#include <cstring>

#define LOOP_COUNT 1000000000

unsigned long long rdtscl(void)
{
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
    return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}

int main()
{
    unsigned long long before = rdtscl();
    size_t ret;
    for (int i = 0; i < LOOP_COUNT; i++)
        ret = strlen("abcd");
    unsigned long long after = rdtscl();
    printf("Strlen %lld ret=%zd\n",(after - before), ret);

    before = rdtscl();
    for (int i = 0; i < LOOP_COUNT; i++)
        ret = sizeof("abcd");
    after = rdtscl();
    printf("Strlen %lld ret=%zd\n",(after - before), ret);
}

llvm generated:

; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
  %0 = tail call { i32, i32 } asm sideeffect "rdtsc",
"={ax},={dx},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !1
  %asmresult1.i = extractvalue { i32, i32 } %0, 1
  %conv2.i = zext i32 %asmresult1.i to i64
  %shl.i = shl nuw i64 %conv2.i, 32
  %asmresult.i = extractvalue { i32, i32 } %0, 0
  %conv.i = zext i32 %asmresult.i to i64
  %or.i = or i64 %shl.i, %conv.i
  %1 = tail call { i32, i32 } asm sideeffect "rdtsc",
"={ax},={dx},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !1
  %asmresult.i.25 = extractvalue { i32, i32 } %1, 0
  %asmresult1.i.26 = extractvalue { i32, i32 } %1, 1
  %conv.i.27 = zext i32 %asmresult.i.25 to i64
  %conv2.i.28 = zext i32 %asmresult1.i.26 to i64
  %shl.i.29 = shl nuw i64 %conv2.i.28, 32
  %or.i.30 = or i64 %shl.i.29, %conv.i.27
  %sub = sub i64 %or.i.30, %or.i
  %call2 = tail call i32 (i8*, ...) @printf(i8* nonnull getelementptr
inbounds ([21 x i8], [21 x i8]* @.str, i64 0, i64 0), i64 %sub, i64 4)
  %2 = tail call { i32, i32 } asm sideeffect "rdtsc",
"={ax},={dx},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !1
  %asmresult1.i.32 = extractvalue { i32, i32 } %2, 1
  %conv2.i.34 = zext i32 %asmresult1.i.32 to i64
  %shl.i.35 = shl nuw i64 %conv2.i.34, 32
  br label %for.cond.5

for.cond.5: ; preds = %for.cond.5,
%entry
  %i4.0 = phi i32 [ 0, %entry ], [ %inc10.18, %for.cond.5 ]
  %inc10.18 = add nsw i32 %i4.0, 19
  %exitcond.18 = icmp eq i32 %inc10.18, 1000000001
  br i1 %exitcond.18, label %for.cond.cleanup.7, label %for.cond.5

for.cond.cleanup.7: ; preds = %for.cond.5
  %asmresult.i.31 = extractvalue { i32, i32 } %2, 0
  %conv.i.33 = zext i32 %asmresult.i.31 to i64
  %or.i.36 = or i64 %shl.i.35, %conv.i.33
  %3 = tail call { i32, i32 } asm sideeffect "rdtsc",
"={ax},={dx},~{dirflag},~{fpsr},~{flags}"() #2, !srcloc !1
  %asmresult.i.37 = extractvalue { i32, i32 } %3, 0
  %asmresult1.i.38 = extractvalue { i32, i32 } %3, 1
  %conv.i.39 = zext i32 %asmresult.i.37 to i64
  %conv2.i.40 = zext i32 %asmresult1.i.38 to i64
  %shl.i.41 = shl nuw i64 %conv2.i.40, 32
  %or.i.42 = or i64 %shl.i.41, %conv.i.39
  %sub13 = sub i64 %or.i.42, %or.i.36
  %call14 = tail call i32 (i8*, ...) @printf(i8* nonnull getelementptr
inbounds ([21 x i8], [21 x i8]* @.str, i64 0, i64 0), i64 %sub13, i64 5)
  ret i32 0
}

Am I missing something?

Definitely looks like a bug. It's a consequence of both loops using the
same 'ret' variable. More specifically, the second loop body ends up with a
phi for the value of 'ret', whose value is either 4 or 5 depending on
whether the loop body is executed 0 times or more than 0 times, and the
existence of that phi prevents us from removing the second loop. (The phi
disappears once we remove the first loop, but we don't try to remove the
second loop again afterwards.)

[The 'add 19' is a comical red herring; we unroll the loop 19 times (as 19
is a largeish factor of the trip count), then fold all 19 'add 1'
instructions into one, but this happens after simplifycfg has already tried
and failed to remove the second loop.]

Ah, that explains why the “small example above” is not showing the problem…

Thanks for the explanation.

Shall I raise a bug?

Bug 25429 is for this.

Kun