Missed optimization: operations are not reordered.

Hi
I noticed that the following code is not optimized:

typedef float float4 attribute((ext_vector_type(4)));

// Returns the swizzle RGBA.yzxw plus 3.4f times the swizzle RGBA.xxyw,
// i.e. component-wise (y, z, x, w) + 3.4f * (x, x, y, w).
float4 f(float4 RGBA)
{
return RGBA.yzxw+RGBA.xxyw*3.4f;
}

// Builds a vector whose four components are all x+1, then applies
// W = f(-W) ten times and returns the result.
// The parameter z is not used in the body.
float4 g(int x , float z )
{
float4 W;
// x+1 is computed as an int and converted to float by the chained assignment.
W.x=W.y=W.z=W.w=x+1;
// Fixed trip count of 10; each iteration negates W before passing it to f.
for(int i=0;i<10;i++)
{
W=f(-W);
}
return W;
}

LLVM misses the possibility to reorder instructions and group constants. (The optimization is also not performed when plain float is used instead of float4.)

Is this supposed to happen? The maximum optimization could be constant*vector+vector in this case.

When I compile with
clang temp.cpp -S -emit-llvm -O3 -ffast-math

I get this code:

; IR for g(int, float). The body is straight-line code: the same four-step
; group appears ten times (suffixes none, .1 ... .9), each consuming the
; previous group's result:
;   fsub fast <-0.0 splat>, v   negation of the running vector
;   shufflevector <1,2,0,3>     the .yzxw swizzle
;   shufflevector <0,0,1,3>     the .xxyw swizzle
;   fmul fast by a splat of 0x400B333340000000 (the 3.4f factor), then fadd fast
; No group is combined with another; the negations and constant multiplies
; are all still present. %z is never referenced.
define <4 x float> @_Z1gif(i32 %x, float %z) #0 {
entry:
; Initial value: x+1 converted to float and inserted into all four lanes.
%add = add nsw i32 %x, 1
%conv = sitofp i32 %add to float
%0 = insertelement <4 x float> undef, float %conv, i32 3
%1 = insertelement <4 x float> %0, float %conv, i32 2
%2 = insertelement <4 x float> %1, float %conv, i32 1
%3 = insertelement <4 x float> %2, float %conv, i32 0
; Group 1 of 10.
%sub = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %3
%4 = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%5 = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i = fmul fast <4 x float> %5, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i = fadd fast <4 x float> %4, %mul.i
; Groups 2-10 repeat the same pattern on the previous result.
%sub.1 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i
%6 = shufflevector <4 x float> %sub.1, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%7 = shufflevector <4 x float> %sub.1, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.1 = fmul fast <4 x float> %7, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.1 = fadd fast <4 x float> %6, %mul.i.1
%sub.2 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.1
%8 = shufflevector <4 x float> %sub.2, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%9 = shufflevector <4 x float> %sub.2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.2 = fmul fast <4 x float> %9, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.2 = fadd fast <4 x float> %8, %mul.i.2
%sub.3 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.2
%10 = shufflevector <4 x float> %sub.3, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%11 = shufflevector <4 x float> %sub.3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.3 = fmul fast <4 x float> %11, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.3 = fadd fast <4 x float> %10, %mul.i.3
%sub.4 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.3
%12 = shufflevector <4 x float> %sub.4, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%13 = shufflevector <4 x float> %sub.4, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.4 = fmul fast <4 x float> %13, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.4 = fadd fast <4 x float> %12, %mul.i.4
%sub.5 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.4
%14 = shufflevector <4 x float> %sub.5, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%15 = shufflevector <4 x float> %sub.5, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.5 = fmul fast <4 x float> %15, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.5 = fadd fast <4 x float> %14, %mul.i.5
%sub.6 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.5
%16 = shufflevector <4 x float> %sub.6, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%17 = shufflevector <4 x float> %sub.6, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.6 = fmul fast <4 x float> %17, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.6 = fadd fast <4 x float> %16, %mul.i.6
%sub.7 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.6
%18 = shufflevector <4 x float> %sub.7, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%19 = shufflevector <4 x float> %sub.7, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.7 = fmul fast <4 x float> %19, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.7 = fadd fast <4 x float> %18, %mul.i.7
%sub.8 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.7
%20 = shufflevector <4 x float> %sub.8, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%21 = shufflevector <4 x float> %sub.8, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.8 = fmul fast <4 x float> %21, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.8 = fadd fast <4 x float> %20, %mul.i.8
%sub.9 = fsub fast <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %add.i.8
%22 = shufflevector <4 x float> %sub.9, <4 x float> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 3>
%23 = shufflevector <4 x float> %sub.9, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 1, i32 3>
%mul.i.9 = fmul fast <4 x float> %23, <float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000, float 0x400B333340000000>
%add.i.9 = fadd fast <4 x float> %22, %mul.i.9
; Result of the tenth group is returned.
ret <4 x float> %add.i.9
}