Clang generating unnecessary spills(?) when passing a struct argument by value

// Test case:

// T: a struct of six int fields; it is passed by value in the call below.
typedef struct { int a; int b; int c; int d; int e; int f; } T;

// foo takes a T by value. Only the declaration is visible here.
extern int foo(T a);

// blah: a T object defined elsewhere; no alignment is specified on this declaration.
extern T blah;

// boo: passes the global blah to foo by value and returns foo's result.
int boo() {

return foo(blah);

}

// Generated code (Clang -O3) – trunk, 10.0.0 (https://godbolt.org/):

# boo (Clang -O3 output, AT&T syntax).
# Copies the 24 bytes at blah into a temporary at 32(%rsp)..55(%rsp), then
# copies that temporary to 0(%rsp)..23(%rsp) before calling foo.
# NOTE(review): 0(%rsp)..23(%rsp) is presumably the by-value stack argument
# slot for foo under the System V AMD64 ABI - confirm against the ABI.
boo: # @boo

subq $56, %rsp            # reserve 56 bytes of stack

movq blah+16(%rip), %rax  # rax = bytes 16..23 of blah

movq %rax, 48(%rsp)       # store them into the temporary at 48(%rsp)

movups blah(%rip), %xmm0  # xmm0 = bytes 0..15 of blah (unaligned load)

movaps %xmm0, 32(%rsp)    # store them into the temporary at 32(%rsp) (aligned store)

movq 48(%rsp), %rax       # reload the 8 bytes that were just stored

movq %rax, 16(%rsp)       # copy them to 16(%rsp)

movaps 32(%rsp), %xmm0    # reload the 16 bytes that were just stored

movups %xmm0, (%rsp)      # copy them to 0(%rsp)

callq foo

addq $56, %rsp            # release the 56 bytes reserved on entry

retq

// Generated code (gcc -O3) – 7.3.0 (https://godbolt.org/):

# boo (GCC -O3 output, AT&T syntax).
# Copies the 24 bytes at blah directly to 0(%rsp)..23(%rsp) with two loads and
# two stores, then calls foo. No intermediate stack temporary is used.
# NOTE(review): 0(%rsp)..23(%rsp) is presumably the by-value stack argument
# slot for foo under the System V AMD64 ABI - confirm against the ABI.
boo:

subq $40, %rsp            # reserve 40 bytes of stack

movq blah+16(%rip), %rax  # rax = bytes 16..23 of blah

movdqu blah(%rip), %xmm0  # xmm0 = bytes 0..15 of blah (unaligned load)

movq %rax, 16(%rsp)       # store bytes 16..23 at 16(%rsp)

movups %xmm0, (%rsp)      # store bytes 0..15 at 0(%rsp)

call foo

addq $40, %rsp            # release the 40 bytes reserved on entry

ret

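Why does Clang emit the extra instructions, namely the stores to 48(%rsp) and 32(%rsp) that are each followed by a reload of the same value? They look like unnecessary spills of %rax and %xmm0: GCC copies blah straight to the bottom of the stack frame with two loads and two stores, while Clang first copies it into a stack temporary and then copies it again from there.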

Adding an 8-byte alignment to blah makes both store/reload pairs (the movq pair through 48(%rsp) and the movaps pair through 32(%rsp)) go away, but I don’t know why the alignment matters in this case.