Excessive register spilling in large automatically generated functions, such as those found in FFTW

Hi,

I've noticed that LLVM tends to generate suboptimal code and spill an
excessive amount of registers in large functions, such as those that
are automatically generated by FFTW.

LLVM generates good code for a function that computes an 8-point
complex FFT, but from 16-point upwards, icc or gcc generates much
better code. Here is an example of a sequence of instructions from a
32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:

  [...]
  movaps 32(%rdi), %xmm3
  movaps 48(%rdi), %xmm2
  movaps %xmm3, %xmm1     ### <-- xmm3 mov'ed into xmm1
  movaps %xmm3, %xmm4     ### <-- xmm3 mov'ed into xmm4
  addps %xmm0, %xmm1
  movaps %xmm1, -16(%rbp) ## 16-byte Spill
  movaps 144(%rdi), %xmm3 ### <-- new data mov'ed into xmm3
  [...]

xmm3 is loaded, duplicated into two registers, and then discarded as
other data is loaded into it. Can anyone shed some light on why this
might be happening?

Below is the code for a 32-point FFT that generated the above
instructions. Sorry I can't supply a smaller example, but this only
begins to happen with functions of this size.

Regards,
Anthony

////////////////////// CODE /////////////////////////////////////////

#include <xmmintrin.h>

#define __INLINE static inline __attribute__((always_inline))
#define LOAD _mm_load_ps
#define STORE _mm_store_ps
#define ADD _mm_add_ps
#define SUB _mm_sub_ps
#define MULT _mm_mul_ps
#define STREAM _mm_stream_ps
#define SHUF _mm_shuffle_ps
#define VLIT4(a,b,c,d) _mm_set_ps(a,b,c,d)
#define SWAP(d) SHUF(d,d,_MM_SHUFFLE(2,3,0,1))        /* swap elements within each adjacent pair */
#define UNPACK2LO(a,b) SHUF(a,b,_MM_SHUFFLE(1,0,1,0)) /* low halves of a and b */
#define UNPACK2HI(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,3,2)) /* high halves of a and b */
#define HALFBLEND(a,b) SHUF(a,b,_MM_SHUFFLE(3,2,1,0)) /* low half of a, high half of b */

__INLINE void TX2(__m128 *a, __m128 *b) {
    __m128 TX2_t0 = UNPACK2LO(*a, *b);
    __m128 TX2_t1 = UNPACK2HI(*a,*b);
    *a = TX2_t0; *b = TX2_t1;
}
__INLINE void FMA(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = ADD(*Rd, MULT(Rn, Rm)); } /* Rd += Rn*Rm */
__INLINE void FMS(__m128 *Rd, __m128 Rn, __m128 Rm) { *Rd = SUB(*Rd, MULT(Rn, Rm)); } /* Rd -= Rn*Rm */
__INLINE __m128 SIGNIFY(__m128 a) {
  __m128 t1;
  t1 = _mm_xor_ps(a, _mm_set_ps(-0.0f, 0.0f, -0.0f, 0.0f));
  return t1;
}
__INLINE __m128 MULI(__m128 a) { return SWAP(SIGNIFY(a)); }
__INLINE __m128 MUL(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMS(&re, im, SWAP(d));
  return re;
}
__INLINE __m128 MULJ(__m128 d, __m128 re, __m128 im) {
  re = MULT(re, d);
  FMA(&re, im, SWAP(d));
  return re;
}
__INLINE void S_4(__m128 r0, __m128 r1, __m128 r2, __m128 r3,
                  float *o0, float *o1, float *o2, float *o3) {
  STORE(o0, r0); STORE(o1, r1); STORE(o2, r2); STORE(o3, r3);
}
__INLINE void K_0(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk, zk_d;
  uk = *r0;
  uk2 = *r1;
  zk = ADD(*r2, *r3);
  zk_d = MULI(SUB(*r2, *r3));
  *r0 = ADD(uk, zk);
  *r2 = SUB(uk, zk);
  *r1 = SUB(uk2, zk_d);
  *r3 = ADD(uk2, zk_d);
}
__INLINE void K_N(__m128 re, __m128 im, __m128 *r0, __m128 *r1,
                  __m128 *r2, __m128 *r3) {
  __m128 uk, uk2, zk_p, zk_n, zk, zk_d;

  uk = *r0;
  uk2 = *r1;
  zk_p = MUL(*r2, re, im);
  zk_n = MULJ(*r3, re, im);
  zk = ADD(zk_p, zk_n);
  zk_d = MULI(SUB(zk_p, zk_n));

  *r2 = SUB(uk, zk);
  *r0 = ADD(uk, zk);
  *r3 = ADD(uk2, zk_d);
  *r1 = SUB(uk2, zk_d);
}

__INLINE void L_4_4(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t2 = LOAD(i2);
  t3 = LOAD(i3);
  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = MULI(SUB(t2, t3));
  t0 = ADD(t4, t6);
  t2 = SUB(t4, t6);
  t1 = SUB(t5, t7);
  t3 = ADD(t5, t7);
  TX2(&t0, &t1);
  TX2(&t2, &t3);
  *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3;
}
__INLINE void L_2_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t2 = LOAD(i2);
  t3 = LOAD(i3);

  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = SUB(t2, t3);
  TX2(&t4, &t5);
  TX2(&t6, &t7);
  *r0 = t4; *r2 = t5; *r1 = t6; *r3 = t7;
}
__INLINE void L_4_2(const float *i0, const float *i1, const float *i2, const float *i3,
                    __m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3) {
  __m128 t0, t1, t2, t3, t4, t5, t6, t7;
  t0 = LOAD(i0);
  t1 = LOAD(i1);
  t6 = LOAD(i2);
  t7 = LOAD(i3);
  t2 = HALFBLEND(t6, t7);
  t3 = HALFBLEND(t7, t6);

  t4 = ADD(t0, t1);
  t5 = SUB(t0, t1);
  t6 = ADD(t2, t3);
  t7 = SUB(t2, t3);
  *r2 = UNPACK2HI(t4, t5);
  *r3 = UNPACK2HI(t6, t7);
  t7 = MULI(t7);
  t0 = ADD(t4, t6);
  t2 = SUB(t4, t6);
  t1 = SUB(t5, t7);
  t3 = ADD(t5, t7);
  *r0 = UNPACK2LO(t0, t1);
  *r1 = UNPACK2LO(t2, t3);
}

void fft32(const float *in, float *out) {
  __m128 r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15,
         r16_17, r18_19, r20_21, r22_23, r24_25, r26_27, r28_29, r30_31;

  L_4_4(in+0,in+32,in+16,in+48,&r0_1,&r2_3,&r16_17,&r18_19);
  L_2_2(in+8,in+40,in+56,in+24,&r4_5,&r6_7,&r20_21,&r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r0_1,&r2_3,&r4_5,&r6_7);
  L_4_2(in+4,in+36,in+20,in+52,&r8_9,&r10_11,&r28_29,&r30_31);
  L_4_4(in+60,in+28,in+12,in+44,&r12_13,&r14_15,&r24_25,&r26_27);
  K_N(VLIT4(0.9238,0.9238,1,1),VLIT4(0.3826,-0.3826,0,-0),&r0_1,&r4_5,&r8_9,&r12_13);
  K_N(VLIT4(0.3826,0.3826,0.7071,0.7071),VLIT4(0.9238,-0.9238,0.7071,-0.7071),&r2_3,&r6_7,&r10_11,&r14_15);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r16_17,&r18_19,&r20_21,&r22_23);
  K_N(VLIT4(0.7071,0.7071,1,1),VLIT4(0.7071,-0.7071,0,-0),&r24_25,&r26_27,&r28_29,&r30_31);
  K_N(VLIT4(0.9807,0.9807,1,1),VLIT4(0.1950,-0.1950,0,-0),&r0_1,&r8_9,&r16_17,&r24_25);
  S_4(r0_1,r8_9,r16_17,r24_25,out+0,out+16,out+32,out+48);
  K_N(VLIT4(0.8314,0.8314,0.9238,0.9238),VLIT4(0.5555,-0.5555,0.3826,-0.3826),&r2_3,&r10_11,&r18_19,&r26_27);
  S_4(r2_3,r10_11,r18_19,r26_27,out+4,out+20,out+36,out+52);
  K_N(VLIT4(0.5555,0.5555,0.7071,0.7071),VLIT4(0.8314,-0.8314,0.7071,-0.7071),&r4_5,&r12_13,&r20_21,&r28_29);
  S_4(r4_5,r12_13,r20_21,r28_29,out+8,out+24,out+40,out+56);
  K_N(VLIT4(0.1950,0.1950,0.3826,0.3826),VLIT4(0.9807,-0.9807,0.9238,-0.9238),&r6_7,&r14_15,&r22_23,&r30_31);
  S_4(r6_7,r14_15,r22_23,r30_31,out+12,out+28,out+44,out+60);

}
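
For anyone who wants to reproduce this, a minimal driver along these lines
can be appended to the file above (this is a sketch added for illustration,
not part of the original code; the input values are arbitrary). The kernel
uses _mm_load_ps/_mm_store_ps, so both buffers must be 16-byte aligned, and
the 32 complex points occupy 64 consecutive floats:

#include <stdio.h>

int main(void) {
  /* 16-byte alignment is required by the aligned loads/stores above. */
  __attribute__((aligned(16))) float in[64], out[64];
  for (int i = 0; i < 64; i++)
    in[i] = (float)i;       /* arbitrary test input */
  fft32(in, out);
  /* Print the first output pair so the results are actually used. */
  printf("out[0] = %f, out[1] = %f\n", out[0], out[1]);
  return 0;
}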

> I've noticed that LLVM tends to generate suboptimal code and spill an
> excessive amount of registers in large functions, such as those that
> are automatically generated by FFTW.
One problem might be that we're forcing the 16 stores to the out array to happen in source order, which constrains the schedule. The stores are clearly non-aliasing.
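
Purely as an illustration of the non-aliasing point (a sketch added here,
not code from the original thread): with restrict-qualified pointers the
compiler is told the two buffers never overlap, so in principle the stores
are free to be reordered or interleaved with the arithmetic. The same
qualifiers could be put on fft32's in/out parameters; whether that actually
changes the store scheduling seen here is a separate question.

#include <xmmintrin.h>

void butterfly_pair(const float *restrict in, float *restrict out) {
  __m128 a = _mm_load_ps(in);
  __m128 b = _mm_load_ps(in + 4);
  /* Neither store can alias the loads, so nothing forces the two stores
   * to stay in source order beyond their own data dependencies. */
  _mm_store_ps(out,     _mm_add_ps(a, b));
  _mm_store_ps(out + 4, _mm_sub_ps(a, b));
}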

> LLVM generates good code for a function that computes an 8-point
> complex FFT, but from 16-point upwards, icc or gcc generates much
> better code. Here is an example of a sequence of instructions from a
> 32-point FFT, compiled with clang/LLVM 3.1 for x86_64 with SSE:
>
>   [...]
>
> xmm3 is loaded, duplicated into two registers, and then discarded as
> other data is loaded into it. Can anyone shed some light on why this
> might be happening?

I'm not actually seeing this behavior on trunk.

/jakob

I've just tried trunk, and although behavior like the above isn't
immediately obvious, trunk generates more instructions and spills more
registers than 3.1.

amb

Actually, here is an occurrence of that behavior when compiling the
code with trunk:

  [...]
  movaps %xmm1, %xmm0   ### xmm1 mov'ed to xmm0
  movaps %xmm1, %xmm14  ### xmm1 mov'ed to xmm14
  addps %xmm7, %xmm0
  movaps %xmm7, %xmm13
  movaps %xmm0, %xmm1   ### and now other data is mov'ed into xmm1,
                        ### making one of the above movaps superfluous
  [...]

amb

As well as many occurrences of the above form, a similar form appears:

  [...]
  movaps %xmm5, %xmm7
  movaps %xmm7, %xmm3
  movaps -96(%rsp), %xmm0  ## 16-byte Reload
  subps %xmm0, %xmm3
  addps %xmm0, %xmm7
  movaps 240(%rsp), %xmm0  ## 16-byte Reload
  movaps -128(%rsp), %xmm1 ## 16-byte Reload
  movlhps %xmm0, %xmm1     ## xmm1 = xmm1[0],xmm0[0]
  movaps %xmm8, %xmm4
  movaps 160(%rsp), %xmm5  ## 16-byte Reload
  [...]

Here the problem manifests with xmm3, xmm5 and xmm7, but in contrast to
the above case, there is now a data dependence between the first pair of
instructions.

amb