How to avoid register spills in wide integer addition?

Hi all,
I want to write a function such as

void add512(uint512_t *pz, const uint512_t *px, const uint512_t *py)
  *pz = *px + *py;

# uint512_t means a 512-bit unsigned integer type.

So I wrote the following sample code:

cat t.ll

define void @add512(i512*noalias %pz, i512*noalias %px, i512*noalias %py)
  %x = load i512* %px
  %y = load i512* %py
  %z = add i512 %x, %y
  store i512 %z, i512* %pz
  ret void

llc-3.6 -O3 -march=x86 t.ll -o -

        pushl %ebp
        pushl %ebx
        pushl %edi
        pushl %esi
        subl $56, %esp
        movl 84(%esp), %eax
        movl 80(%esp), %edi
        movl 8(%edi), %esi
        movl (%edi), %edx
        movl 4(%edi), %ebx
        movl 60(%eax), %ecx
        movl %ecx, 52(%esp) # 4-byte Spill
        movl 56(%eax), %ecx
        movl %ecx, 48(%esp) # 4-byte Spill
        addl (%eax), %edx
        movl %edx, 24(%esp) # 4-byte Spill
        adcl 4(%eax), %ebx
        movl %ebx, 20(%esp) # 4-byte Spill
        adcl 8(%eax), %esi
        movl %esi, 16(%esp) # 4-byte Spill
        movl 52(%eax), %ecx
        movl %ecx, 40(%esp) # 4-byte Spill
        movl 48(%eax), %ecx
        movl %ecx, 36(%esp) # 4-byte Spill

I expected code such as

    pushl %ebx
    movl 8(%esp), %ebx ; pz
    movl 12(%esp), %ecx ; px
    movl 16(%esp), %edx ; py
    movl (%ecx), %eax
    addl (%edx), %eax
    movl %eax, (%ebx)
    movl 4(%ecx), %eax
    adcl 4(%edx), %eax
    movl %eax, 4(%ebx)
    movl 8(%ecx), %eax
    adcl 8(%edx), %eax
    movl %eax, 8(%ebx)

How can I get LLVM to use the memory at %pz directly as the work area for %z, instead of spilling the partial sums to the stack?

By contrast,

void twice512(uint512_t *pz, const uint512_t *px)
  *pz = *px + *px;
generates good code without spills.