LLVM/Clang optimization issue -- optimizer fails to discover common loop variable

Hi,

I’m having trouble getting LLVM/Clang to generate high-quality code for a tight loop involving AVX intrinsics.
Consider the following simple function, which computes the element-wise sum of six input buffers and stores it in an output buffer.

#include <immintrin.h>

/*
 * Element-wise sum of six __m256 buffers.
 *
 * For every index i in [0, n) this stores
 *   a[i] = ((b[i] + c[i]) + (d[i] + e[i])) + (f[i] + g[i])
 * using one _mm256_add_ps call per addition.
 *
 * n      number of __m256 elements processed in each buffer
 * a      output buffer, written at indices 0..n-1
 * b..g   input buffers, read at indices 0..n-1
 *
 * Every pointer is restrict-qualified, so the caller promises that the
 * buffers accessed here do not overlap.
 */
void sum(size_t n,
__m256* restrict a, __m256* restrict b,
__m256* restrict c, __m256* restrict d,
__m256* restrict e, __m256* restrict f,
__m256* restrict g) {

/* A single index i addresses all seven buffers. */
for (size_t i = 0; i < n; ++i)
/* Pairwise grouping: (b+c) and (d+e) are combined first, then (f+g) is added. */
a[i] = _mm256_add_ps(
_mm256_add_ps(_mm256_add_ps(b[i], c[i]),
_mm256_add_ps(d[i], e[i])),
_mm256_add_ps(f[i], g[i]));
}

Below is the main loop body generated for the above function by LLVM version 7.3.0 (clang-703.0.31), compiled with -mavx2 -O3 -fomit-frame-pointer -fno-unroll-loops. Note the large number of “addq” instructions: each of the seven buffer pointers is incremented separately on every iteration!
The compiler also seems to assume that the __m256 entries are unaligned (it emits vmovups rather than vmovaps), but that is another issue.

# Inner loop as emitted by Clang. Each buffer is addressed through its own
# pointer register, and all seven pointers are advanced individually at the
# end of every iteration.
LBB0_2:

vmovups (%rdx), %ymm0           # unaligned 32-byte load of the first operand
vaddps (%rcx), %ymm0, %ymm0     # ymm0 += memory operand
vmovups (%r8), %ymm1            # unaligned 32-byte load
vaddps (%r9), %ymm1, %ymm1      # ymm1 += memory operand
vaddps %ymm1, %ymm0, %ymm0      # combine the first two pair sums
vmovups (%rax), %ymm1           # unaligned 32-byte load
vaddps (%r10), %ymm1, %ymm1     # ymm1 += memory operand
vaddps %ymm1, %ymm0, %ymm0      # add the third pair sum
vmovups %ymm0, (%rsi)           # unaligned 32-byte store of the result
addq $32, %rdx                  # advance pointer by 32 bytes (one 8 x float vector)
addq $32, %rcx                  # advance pointer by 32 bytes
addq $32, %r8                   # advance pointer by 32 bytes
addq $32, %r9                   # advance pointer by 32 bytes
addq $32, %rax                  # advance pointer by 32 bytes
addq $32, %r10                  # advance pointer by 32 bytes
addq $32, %rsi                  # advance the store pointer by 32 bytes
decq %rdi                       # count down the remaining iterations
jne LBB0_2                      # loop until the counter reaches zero

For comparison, the following output was generated by GCC. It identifies the common counter variable and just stores the buffer base addresses in registers, so only a single offset register has to be incremented per iteration.

# Inner loop using one shared byte offset in %rax. Every memory operand is
# formed as (base register + %rax), so only %rax changes per iteration.
L5:
vmovaps (%r9,%rax), %ymm1       # aligned 32-byte load at base + offset
vmovaps (%rcx,%rax), %ymm0      # aligned 32-byte load at base + offset
vaddps (%r8,%rax), %ymm1, %ymm2 # ymm2 = ymm1 + memory operand
vaddps (%rdx,%rax), %ymm0, %ymm0 # ymm0 += memory operand
vaddps %ymm0, %ymm2, %ymm1      # combine the first two pair sums
vmovaps (%r11,%rax), %ymm0      # aligned 32-byte load at base + offset
vaddps (%rbx,%rax), %ymm0, %ymm0 # ymm0 += memory operand
vaddps %ymm0, %ymm1, %ymm0      # add the third pair sum
vmovaps %ymm0, (%rsi,%rax)      # aligned 32-byte store of the result
addq $32, %rax                  # the only increment: advance the shared offset by 32 bytes
cmpq %rax, %r10                 # compare offset with the bound in %r10 (presumably the total byte count; setup not shown)
jne L5                          # loop while the offset differs from the bound

Is there something that could be done to LLVM to generate better code in such cases?

For reference, this is the associated LLVM IR:

; Function Attrs: nounwind ssp uwtable
; IR for sum(): %n is the element count, %a is the written buffer and
; %b..%g are the read-only buffers; all pointers are marked noalias.
; NOTE(review): the symbol name looks like an Itanium-mangled name that lost
; its leading and trailing underscore (expected something like
; @_Z3summPDv8_fS0_S0_S0_S0_S0_S0_), probably due to text formatting -- confirm
; against the original compiler output.
define void @Z3summPDv8_fS0_S0_S0_S0_S0_S0(i64 %n, <8 x float>* noalias nocapture %a, <8 x float>* noalias nocapture readonly %b, <8 x float>* noalias nocapture readonly %c, <8 x float>* noalias nocapture readonly %d, <8 x float>* noalias nocapture readonly %e, <8 x float>* noalias nocapture readonly %f, <8 x float>* noalias nocapture readonly %g) #0 {
%1 = icmp eq i64 %n, 0 ; skip the loop entirely when n == 0
br i1 %1, label %._crit_edge, label %.lr.ph

._crit_edge: ; preds = %.lr.ph, %0
ret void

; Loop body. Note that every getelementptr below indexes with the same
; induction variable %i.01, i.e. the common counter is explicit at IR level.
.lr.ph: ; preds = %0, %.lr.ph
%i.01 = phi i64 [ %20, %.lr.ph ], [ 0, %0 ] ; loop index: 0 on entry, %20 on the back edge
%2 = getelementptr inbounds <8 x float>, <8 x float>* %b, i64 %i.01 ; &b[i]
; The loads and the store use "align 16", less than the 32-byte vector size;
; presumably this is why the backend emits unaligned moves.
%3 = load <8 x float>, <8 x float>* %2, align 16, !tbaa !2
%4 = getelementptr inbounds <8 x float>, <8 x float>* %c, i64 %i.01 ; &c[i]
%5 = load <8 x float>, <8 x float>* %4, align 16, !tbaa !2
%6 = fadd <8 x float> %3, %5 ; b[i] + c[i]
%7 = getelementptr inbounds <8 x float>, <8 x float>* %d, i64 %i.01 ; &d[i]
%8 = load <8 x float>, <8 x float>* %7, align 16, !tbaa !2
%9 = getelementptr inbounds <8 x float>, <8 x float>* %e, i64 %i.01 ; &e[i]
%10 = load <8 x float>, <8 x float>* %9, align 16, !tbaa !2
%11 = fadd <8 x float> %8, %10 ; d[i] + e[i]
%12 = fadd <8 x float> %6, %11 ; (b[i] + c[i]) + (d[i] + e[i])
%13 = getelementptr inbounds <8 x float>, <8 x float>* %f, i64 %i.01 ; &f[i]
%14 = load <8 x float>, <8 x float>* %13, align 16, !tbaa !2
%15 = getelementptr inbounds <8 x float>, <8 x float>* %g, i64 %i.01 ; &g[i]
%16 = load <8 x float>, <8 x float>* %15, align 16, !tbaa !2
%17 = fadd <8 x float> %14, %16 ; f[i] + g[i]
%18 = fadd <8 x float> %12, %17 ; full six-operand sum
%19 = getelementptr inbounds <8 x float>, <8 x float>* %a, i64 %i.01 ; &a[i]
store <8 x float> %18, <8 x float>* %19, align 16, !tbaa !2
%20 = add nuw i64 %i.01, 1 ; i + 1
%exitcond = icmp eq i64 %20, %n ; done once the incremented index equals n
br i1 %exitcond, label %._crit_edge, label %.lr.ph
}

Thank you and best regards,
Wenzel