missing optimization opportunity for const std::vector compared to std::array

i've written this small testprogram to test the gcc4.8.1 optimizer and found a optimization opportunity

for details see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58483

then i compared the gcc results to clang 3.3 and found that the optimization of llvm "seems" to be far aways from the gcc results in this case

--- test.cpp ---
#include <vector>
#include <numeric>
#include <array>

static int calc(const std::array<int,3> p_ints, const int& p_init)
//static int calc(const std::vector<int> p_ints, const int& p_init)
{
   return std::accumulate(p_ints.begin(), p_ints.end(), p_init);
}

int main()
{
   const int result = calc({10,20,30},100);
   return result;
}

gcc-optimizer-result using std::array

main:
     mov eax, 160
     ret

gcc-optimizer result using std::vector

main:
     push rbx
     mov edi, 12
     call operator new(unsigned long)
     mov rdx, QWORD PTR ._81[rip]
     mov rdi, rax
     mov QWORD PTR [rax], rdx
     mov eax, DWORD PTR ._81[rip+8]
     mov rsi, rdx
     shr rsi, 32
     lea ebx, [rsi+100+rdx]
     add ebx, eax
     test rdi, rdi
     mov DWORD PTR [rdi+8], eax
     je .L2
     call operator delete(void*)
.L2:
     mov eax, ebx
     pop rbx
     ret
._81:
     .long 10
     .long 20
     .long 30

the clang 3.3 results for -O3 -march=native -std=c++11

using std::array

main: # @main
     movabsq $85899345930, %rax # imm = 0x140000000A
     movq %rax, -16(%rsp)
     movl $100, %esi
     movl $30, -8(%rsp)
     xorl %edx, %edx
     leaq -16(%rsp), %rcx
     movb $1, %al
     testb %al, %al
     jne .LBB0_1
     movd %esi, %xmm1
     pxor %xmm0, %xmm0
     xorl %eax, %eax
.LBB0_3: # %vector.body.i.i
     movdqu (%rsp,%rax,4), %xmm2
     paddd %xmm2, %xmm0
     movdqu -16(%rsp,%rax,4), %xmm2
     paddd %xmm2, %xmm1
     addq $8, %rax
     cmpq %rax, %rdx
     jne .LBB0_3
     jmp .LBB0_4
.LBB0_1:
     pxor %xmm0, %xmm0
     movd %esi, %xmm1
.LBB0_4: # %middle.block.i.i
     movl $3, %esi
     paddd %xmm1, %xmm0
     movdqa %xmm0, %xmm1
     movhlps %xmm1, %xmm1 # xmm1 = xmm1[1,1]
     paddd %xmm0, %xmm1
     phaddd %xmm1, %xmm1
     movd %xmm1, %eax
     cmpq %rdx, %rsi
     je .LBB0_7
     addq $-12, %rcx
     leaq -16(%rsp), %rdx
.LBB0_6: # %scalar.ph.i.i
     addl 12(%rcx), %eax
     addq $4, %rcx
     cmpq %rcx, %rdx
     jne .LBB0_6
.LBB0_7: # %_ZL4calcSt5arrayIiLm3EERKi.exit
     ret

using std::vector

main: # @main
     pushq %rbx
     movl $12, %edi
     callq operator new(unsigned long)
     movabsq $85899345930, %rcx # imm = 0x140000000A
     movq %rcx, (%rax)
     xorl %ecx, %ecx
     movl $3, %edx
     movl $100, %esi
     movl $30, 8(%rax)
     movb $1, %bl
     movd %esi, %xmm1
     pxor %xmm0, %xmm0
     testb %bl, %bl
     jne .LBB0_3
     xorl %esi, %esi
.LBB0_2: # %vector.body.i.i
     movdqu 16(%rax,%rsi,4), %xmm2
     paddd %xmm2, %xmm0
     movdqu (%rax,%rsi,4), %xmm2
     paddd %xmm2, %xmm1
     addq $8, %rsi
     cmpq %rsi, %rcx
     jne .LBB0_2
.LBB0_3: # %middle.block.i.i
     paddd %xmm1, %xmm0
     movdqa %xmm0, %xmm1
     movhlps %xmm1, %xmm1 # xmm1 = xmm1[1,1]
     paddd %xmm0, %xmm1
     phaddd %xmm1, %xmm1
     movd %xmm1, %ebx
     cmpq %rcx, %rdx
     je .LBB0_6
     movq %rax, %rcx
     addq $-12, %rcx
.LBB0_5: # %scalar.ph.i.i
     addl 12(%rcx), %ebx
     addq $4, %rcx
     cmpq %rcx, %rax
     jne .LBB0_5
.LBB0_6: # %_ZL4calcSt6vectorIiSaIiEERKi.exit
     testq %rax, %rax
     je .LBB0_8
     movq %rax, %rdi
     callq operator delete(void*)
.LBB0_8: # %_ZNSt6vectorIiSaIiEED1Ev.exit
     movl %ebx, %eax
     popq %rbx
     ret

is the llvm optimizer not able to optimize this better, is that a better result or do i something wrong here

thx

i've written this small testprogram to test the gcc4.8.1 optimizer and found a optimization opportunity

for details see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58483

then i compared the gcc results to clang 3.3 and found that the optimization of llvm "seems" to be far aways from the gcc results in this case

What you're seeing in the std::array case is likely an artifact of running the loop vectorizer too early in LLVM 3.3. This is fixed in trunk. The std::vector case is trickier, I think the main problem here is that the optimizer wasn't able to see that the loop inside std::accumulate has a constant trip count.

- Ben

> i've written this small testprogram to test the gcc4.8.1 optimizer and found a optimization opportunity
>
> for details see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58483
>
> then i compared the gcc results to clang 3.3 and found that the optimization of llvm "seems" to be far aways from the gcc results in this case

What you're seeing in the std::array case is likely an artifact of running the loop vectorizer too early in LLVM 3.3. This is fixed in trunk.

so the result is also

return 160;

for the trunk version?

The std::vector case is trickier, I think the main problem here is that the optimizer wasn't able to see that the loop inside std::accumulate has a constant trip count.

so the problem in the llvm optimizer is different to the gcc one?

the gcc understands the constant trip count, but threre is no rule for removing unneeded allocation with new/delete
the gcc can only remove malloc/free - but its ongoing work

> i've written this small testprogram to test the gcc4.8.1 optimizer and found a optimization opportunity
>
> for details see http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58483
>
> then i compared the gcc results to clang 3.3 and found that the optimization of llvm "seems" to be far aways from the gcc results in this case

What you're seeing in the std::array case is likely an artifact of running the loop vectorizer too early in LLVM 3.3. This is fixed in trunk.

so the result is also

return 160;

for the trunk version?

Yes, it should catch this case.

The std::vector case is trickier, I think the main problem here is that the optimizer wasn't able to see that the loop inside std::accumulate has a constant trip count.

so the problem in the llvm optimizer is different to the gcc one?

the gcc understands the constant trip count, but threre is no rule for removing unneeded allocation with new/delete
the gcc can only remove malloc/free - but its ongoing work

The trip count computation is just a bug and I'm working on fixing that, then the code will be a lot cleaner. Sadly, removing the allocation is harder. LLVM doesn't seem to be able to forward the values copied into the vector to the values taken out of the vector. If that's done the allocation could be deleted as dead code, LLVM already understands new/delete.

- Ben

sounds all very good - i hope to see these bugs getting fixed, the "forward the values copied into the vector" problems seems to be also evil for other situations of optimization

If you're interested in what we can do about those cases I can recommend Chandler Carruth's keynote from http://llvm.org/devmtg/2013-04/ . It discusses what's so hard about optimizing simple std::vector operations.

- Ben

thx for the keynote link (how can i missed that one?)

i've rechecked against

gcc.http://gcc.godbolt.org/
*clang version 3.4.1
*gcc 4.9 20130909
with -O3 -std=c++11

gcc does not optimize down to result 160 and does not remove the new/deletes

main:
  pushq %rbx
  movl $12, %edi
  call operator new(unsigned long)
  movq ._41(%rip), %rdx
  movq %rax, %rdi
  movq %rdx, %rsi
  movq %rdx, (%rax)
  movl ._41+8(%rip), %eax
  shrq $32, %rsi
  leal 100(%rsi,%rdx), %ebx
  movl %eax, 8(%rdi)
  addl %eax, %ebx
  testq %rdi, %rdi
  je .L2
  call operator delete(void*)
.L2:
  movl %ebx, %eax
  popq %rbx
  ret
._41:
  .long 10
  .long 20
  .long 30

clang does optimize down to result 160 but still not remove the new/deletes

main: # @main
  pushq %rax
  movl $12, %edi
  callq operator new(unsigned long)
  testq %rax, %rax
  movabsq $85899345930, %rcx # imm = 0x140000000A
  movq %rcx, (%rax)
  movl $30, 8(%rax)
  je .LBB0_2
  movq %rax, %rdi
  callq operator delete(void*)
.LBB0_2: # %_ZNSt6vectorIiSaIiEED2Ev.exit
  movl $160, %eax
  popq %rdx
  ret

so Bens "Sadly, removing the allocation is harder." seems to be still hard

another bigger example - clang does not get result/remove allocs, but the
resulting code is much smaller then gcc 4.9.0

This is close to the best result that a conforming compiler can give (for
the standard library implementation you're using). We're not *permitted* to
remove those operator new/operator delete calls by the standard, because
operator new and operator delete can be overridden by the user.

The standard (as of only a few months ago) allows operator new/operator
delete calls such as the above to be removed, but *only* if they came from
new-expressions and delete-expressions. Under the covers, std::vector uses
direct calls to ::operator new and ::operator delete, so these provisions
do not apply.

So... we should add some way for std::allocator to say "allocate/deallocate
memory like a new-expression", and use it in libc++. I think we should add

  void *__builtin_operator_new(size_t)
  void __builtin_operator_delete(void*)

for this.

sounds like an good idea

but i do not understand why its so hard to track overridden allocations - or why
they can't be removed - in the end there is an malloc/free or __builtin_operator_new/delete

Well, consider the following code:

  int * foo () { return new int; }

Does that call the default operator new?
Or some overridden one?

The compiler *can’t tell*.
In some other compilation unit, the user could have

#include <new>

char buffer[10240];
int offset = 0;

void * operator new (size_t sz) throw(std::bad_alloc) {
  void *p = buffer + offset;
  offset += sz;
  return p;
  }

and it’s up to the linker to figure that out.

— Marshall

Clang now supports these, and I have a patch out for review to make libc++
use them. With the patch applied, clang+libc++ optimizes your code down to
just 'return 160' for both the std::array and std::vector cases.

unbelievably fast integration :slight_smile: - thanks alot for taking the time

how well does your patch play with

gcc.http://gcc.godbolt.org, clang version 3.4.1 -O3 -std=c++11

#include <string>
int main()
{
   return std::string("hello").size();
}

results in:

main: # @main
     pushq %rbx
     subq $32, %rsp
     leaq 16(%rsp), %rdi
     leaq 8(%rsp), %rdx
     movl $.L.str, %esi
     callq std::basic_string<char, std::char_traits<char>, std::allocator<char> >::basic_string(char const*, std::allocator<char> const&)
     movq 16(%rsp), %rax
     leaq -24(%rax), %rdi
     movl std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep::_S_empty_rep_storage, %ecx
     cmpq %rcx, %rdi
     movl -24(%rax), %ebx
     jne .LBB0_1
.LBB0_6: # %_ZNSsD1Ev.exit
     movl %ebx, %eax
     addq $32, %rsp
     popq %rbx
     ret
.LBB0_1:
     addq $-8, %rax
     movl $__pthread_key_create, %ecx
     testq %rcx, %rcx
     je .LBB0_3
     movl $-1, %ecx
     lock
     xaddl %ecx, (%rax)
     movl %ecx, 28(%rsp)
     movl 28(%rsp), %ecx
     jmp .LBB0_4
.LBB0_3:
     movl (%rax), %ecx
     leal -1(%rcx), %edx
     movl %edx, (%rax)
.LBB0_4: # %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
     testl %ecx, %ecx
     jg .LBB0_6
     leaq 24(%rsp), %rsi
     callq std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep::_M_destroy(std::allocator<char> const&)
     jmp .LBB0_6

.L.str:
     .asciz "hello"

and

#include <vector>
#include <numeric>

typedef std::vector<int> container_t;

int main()
{
   const container_t a{1,2};
   const container_t b{4,5};
   const container_t ints
   {
     std::accumulate(a.begin(),a.end(),1),
     std::accumulate(b.begin(),b.end(),2),
   };
   return std::accumulate(ints.begin(),ints.end(),100);
}

results in:

main: # @main
     pushq %rbp
     pushq %r15
     pushq %r14
     pushq %rbx
     pushq %rax
     movl $8, %edi
     callq operator new(unsigned long)
     movq %rax, %r14
     movabsq $8589934593, %rax # imm = 0x200000001
     movq %rax, (%r14)
     movl $8, %edi
     callq operator new(unsigned long)
     movq %rax, %rbx
     movabsq $21474836484, %rax # imm = 0x500000004
     movq %rax, (%rbx)
     movl (%r14), %r15d
     movl 4(%r14), %ebp
     movl $8, %edi
     callq operator new(unsigned long)
     leal 1(%r15,%rbp), %ebp
     testq %rax, %rax
     movl %ebp, (%rax)
     movl $11, 4(%rax)
     je .LBB0_5
     movq %rax, %rdi
     callq operator delete(void*)
.LBB0_5: # %_ZNSt6vectorIiSaIiEED2Ev.exit25
     testq %rbx, %rbx
     je .LBB0_7
     movq %rbx, %rdi
     callq operator delete(void*)
.LBB0_7: # %_ZNSt6vectorIiSaIiEED2Ev.exit23
     addl $111, %ebp
     testq %r14, %r14
     je .LBB0_9
     movq %r14, %rdi
     callq operator delete(void*)
.LBB0_9: # %_ZNSt6vectorIiSaIiEED2Ev.exit21
     movl %ebp, %eax
     addq $8, %rsp
     popq %rbx
     popq %r14
     popq %r15
     popq %rbp
     ret
     movq %rax, %rbp
     movq %rbp, %rdi
     callq _Unwind_Resume
     movq %rax, %rbp
     jmp .LBB0_14
     movq %rax, %rbp
     testq %rbx, %rbx
     je .LBB0_14
     movq %rbx, %rdi
     callq operator delete(void*)
.LBB0_14: # %_ZNSt6vectorIiSaIiEED2Ev.exit15
     testq %r14, %r14
     je .LBB0_16
     movq %r14, %rdi
     callq operator delete(void*)
.LBB0_16: # %_ZNSt6vectorIiSaIiEED2Ev.exit
     movq %rbp, %rdi
     callq _Unwind_Resume
GCC_except_table0:
     .byte 255 # @LPStart Encoding = omit
     .byte 3 # @TType Encoding = udata4
     .asciz "\266\200\200" # @TType base offset
     .byte 3 # Call site Encoding = udata4
     .byte 52 # Call site table length
     .long .Lset0
     .long .Lset1
     .long .Lset2
     .byte 0 # On action: cleanup
     .long .Lset3
     .long .Lset4
     .long .Lset5
     .byte 0 # On action: cleanup
     .long .Lset6
     .long .Lset7
     .long .Lset8
     .byte 0 # On action: cleanup
     .long .Lset9
     .long .Lset10
     .long 0 # has no landing pad
     .byte 0 # On action: cleanup

>The standard (as of only a few months ago) allows operator new/operator

>delete calls such as the above to be removed, but*only* if they came
from

>new-expressions and delete-expressions. Under the covers, std::vector
uses
>direct calls to ::operator new and ::operator delete, so these
provisions
>do not apply.
>
>So... we should add some way for std::allocator to say
>"allocate/deallocate memory like a new-expression", and use it in
libc++. I
>think we should add
>
> void *__builtin_operator_new(size_t)
> void __builtin_operator_delete(void*)
>
>for this.
>

Clang now supports these, and I have a patch out for review to make libc++
use them. With the patch applied, clang+libc++ optimizes your code down to
just 'return 160' for both the std::array and std::vector cases.

unbelievably fast integration :slight_smile: - thanks alot for taking the time

how well does your patch play with

gcc.http://gcc.godbolt.org, clang version 3.4.1 -O3 -std=c++11

#include <string>
int main()
{
  return std::string("hello").size();
}

results in:

main: # @main
        movl $5, %eax
        retq

main: # @main

    pushq %rbx
    subq $32, %rsp
    leaq 16(%rsp), %rdi
    leaq 8(%rsp), %rdx
    movl $.L.str, %esi
    callq std::basic_string<char, std::char_traits<char>,
std::allocator<char> >::basic_string(char const*, std::allocator<char>
const&)
    movq 16(%rsp), %rax
    leaq -24(%rax), %rdi
    movl std::basic_string<char, std::char_traits<char>,
std::allocator<char> >::_Rep::_S_empty_rep_storage, %ecx
    cmpq %rcx, %rdi
    movl -24(%rax), %ebx
    jne .LBB0_1
.LBB0_6: # %_ZNSsD1Ev.exit
    movl %ebx, %eax
    addq $32, %rsp
    popq %rbx
    ret
.LBB0_1:
    addq $-8, %rax
    movl $__pthread_key_create, %ecx
    testq %rcx, %rcx
    je .LBB0_3
    movl $-1, %ecx
    lock
    xaddl %ecx, (%rax)
    movl %ecx, 28(%rsp)
    movl 28(%rsp), %ecx
    jmp .LBB0_4
.LBB0_3:
    movl (%rax), %ecx
    leal -1(%rcx), %edx
    movl %edx, (%rax)
.LBB0_4: # %_ZN9__gnu_cxxL27__exchange_
and_add_dispatchEPii.exit.i.i.i
    testl %ecx, %ecx
    jg .LBB0_6
    leaq 24(%rsp), %rsi
    callq std::basic_string<char, std::char_traits<char>,
std::allocator<char> >::_Rep::_M_destroy(std::allocator<char> const&)
    jmp .LBB0_6

.L.str:
    .asciz "hello"

and

#include <vector>
#include <numeric>

typedef std::vector<int> container_t;

int main()
{

  const container_t a{1,2};
  const container_t b{4,5};
  const container_t ints
  {
    std::accumulate(a.begin(),a.end(),1),
    std::accumulate(b.begin(),b.end(),2),
  };
  return std::accumulate(ints.begin(),ints.end(),100);
}

results in:

main: # @main
        movl $115, %eax
        retq

so the introduction of

void *__builtin_operator_new(size_t)
void __builtin_operator_delete(void*)

in clang and your patching of libc++ reduces my testszenarios results
down to the absolutely bare minimum
it can't get better - thank you very much

its a ~small~ optimization - but i think the impact could be very huge for libc++ using code bases

can you post the patch/review link?

are there still known corner cases / similar examples that do not profit from your patch?

dennis

and why is then clang 3.4.1
even without Richard Smith latest patch with __builtin_operator_new/delete
allowed to remove this new?

#include <memory>

int main()
{
   return *new int(10);
}

clang 3.4.1, -O3 -std=c++11

main: # @main
     movl $10, %eax
     ret

gcc 4.9.0 20130909, -O3 -std=c++11

main:
     subq $8, %rsp
     movl $4, %edi
     call operator new(unsigned long)
     movl $10, (%rax)
     movl $10, %eax
     addq $8, %rsp
     ret

GCC's codegen could be improved. However, Clang's appears to be a little
be too eager and potentially non-legit. E.g. it is legit if that
translation unit is the only unit of the entire program. However, it isn't
if that translation unit was linked with another unit providing a
replacement for 'operator new'. This is something that LTO could handle
very well.

-- Gaby

what i do not understand is why an operator new call insider of libc++
isn't remove
and such super small examples are handled (maybe too eager) - are there
different analyse routes
in clang for removing operator new - why - shouldn't that be the same code?

That's using the new operator, which can be optimized away even when
operator new cannot be. (The new operator will use operator new if
needed, but it may not need to.)

-- James