better code for IV

Hi,

I will be glad if you can give me some advice on how to make LLVM generate better code for IV in a simple loop case. There is IR sample below, I know of one way to do it, and I’m looking for advice on another way.

When compiling a simple C loop (c[i]=a[i]+b[i], a, b, c are float*), the IR starts as follows:

define void @ArrayAdd1(float* nocapture %a, float* nocapture %b, float* nocapture %c, i64 %iNumElements) {
Entry:
br label %L_pre_head

L_pre_head: ; preds = %Entry
br label %L_entry

L_entry: ; preds = %L_entry, %L_pre_head
%L_ind_var = phi i64 [ 0, %L_pre_head ], [ %L_inc_ind_var, %L_entry ]
%L_tid = phi i64 [ 0, %L_pre_head ], [ %L_inc_tid, %L_entry ]
%trunc = trunc i64 %L_tid to i32
%idxprom = sext i32 %trunc to i64
%arrayidx = getelementptr inbounds float* %a, i64 %idxprom
%0 = load float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom
%1 = load float* %arrayidx2, align 4
%add = fadd float %0, %1
%arrayidx4 = getelementptr inbounds float* %c, i64 %idxprom
store float %add, float* %arrayidx4, align 4
%L_inc_ind_var = add nuw nsw i64 %L_ind_var, 1
%L_cmp.to.max = icmp eq i64 %L_inc_ind_var, %iNumElements
%L_inc_tid = add nuw nsw i64 %L_tid, 1
br i1 %L_cmp.to.max, label %L_exit, label %L_entry

L_exit: ; preds = %L_entry
br label %2

; :2 ; preds = %L_exit
ret void
}

And after going through all passes before code generation it becomes:

define void @ArrayAdd1(float * nocapture %a, float * nocapture %b, float * nocapture %c, i64 %iNumElements)
{
Entry:
br label %L_pre_head

L_pre_head: ; preds = %Entry
br label %L_entry

L_entry: ; preds = %L_entry, %L_pre_head
%L_ind_var = phi i64 [ 0, %L_pre_head ], [ %L_inc_ind_var, %L_entry ]
%L_tid = phi i64 [ 0, %L_pre_head ], [ %L_inc_tid, %L_entry ]
%sext = shl i64 %L_tid, 32
%idxprom = ashr exact i64 %sext, 32
%arrayidx = getelementptr inbounds float * %a, i64 %idxprom
%0 = load float * %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float * %b, i64 %idxprom
%1 = load float * %arrayidx2, align 4
%add = fadd float %0, %1
%arrayidx4 = getelementptr inbounds float * %c, i64 %idxprom
store float %add, float * %arrayidx4, align 4
%L_inc_ind_var = add nuw nsw i64 %L_ind_var, 1
%L_cmp.to.max = icmp eq i64 %L_inc_ind_var, %iNumElements
%L_inc_tid = add nuw nsw i64 %L_tid, 1
br i1 %L_cmp.to.max, label %L_exit, label %L_entry

L_exit: ; preds = %L_entry
br label %2

; :2 ; preds = %L_exit
ret void
}

On the way to code gen the IR goes through some additional passes, among them the LSR in LoopStrengthReduce.cpp. This pass promotes the shl operation out of the loop. As a result the generated code is awkward and uses 4 registers for IV management instead of 2 (see transformed IR and result asm below).

One way to deal with this is to prevent the transformation of
%trunc = trunc i64 %L_tid to i32
%idxprom = sext i32 %trunc to i64
to
%sext = shl i64 %L_tid, 32
%idxprom = ashr exact i64 %sext, 32

Sext(trunk()) remains nice until code gen and one asm instruction of the sort “move %eax, %rax” is generated, as I want (see asm sample at the end of this message). I implemented this locally in InstCombineCasts.cpp, in InstCombiner::visitSExt(). The code looks for the specific pattern of sext32to64(trunk64to32()) and doesn’t let it become ashr(shl()).

Another way that I can think of, is to keep ashr32(shl32()) together and then it’s possible to replace both instructions with one asm instruction (a la “move %eax, %rax”) and avoid an additional register for monitoring end of loop. I’m not sure what is the right point in LSRInstance::LSRInstance to avoid this optimization.

Do you think that the first way is ok? If you think that this should be done in the second way or a third way I will appreciate your guidance.

Thanks, Anat

The IR after LSR:
define void @ArrayAdd1(float* nocapture %a, float* nocapture %b, float* nocapture %c, i64 %iNumElements) {
Entry:
br label %L_pre_head

L_pre_head: ; preds = %Entry
br label %L_entry

L_entry: ; preds = %L_entry, %L_pre_head
%lsr.iv1 = phi i64 [ %lsr.iv.next2, %L_entry ], [ 0, %L_pre_head ]
%lsr.iv = phi i64 [ %lsr.iv.next, %L_entry ], [ %iNumElements, %L_pre_head ]
%idxprom = ashr exact i64 %lsr.iv1, 32
%arrayidx = getelementptr inbounds float* %a, i64 %idxprom
%0 = load float* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom
%1 = load float* %arrayidx2, align 4
%add = fadd float %0, %1
%arrayidx4 = getelementptr inbounds float* %c, i64 %idxprom
store float %add, float* %arrayidx4, align 4
%lsr.iv.next = add i64 %lsr.iv, -1
%lsr.iv.next2 = add i64 %lsr.iv1, 4294967296
%L_cmp.to.max = icmp eq i64 %lsr.iv.next, 0
br i1 %L_cmp.to.max, label %L_exit, label %L_entry

L_exit: ; preds = %L_entry
br label %2

; :2 ; preds = %L_exit
ret void
}

Asm code:
ArrayAdd1: # @ArrayAdd1
.cfi_startproc

BB#0: # %Entry

xorl %r9d, %r9d
movabsq $4294967296, %r8 # imm = 0x100000000
.align 16, 0x90
.LBB0_1: # %L_entry

=>This Inner Loop Header: Depth=1

movq %r9, %rax
sarq $32, %rax
movss (%rdi,%rax,4), %xmm0
addss (%rsi,%rax,4), %xmm0
movss %xmm0, (%rdx,%rax,4)
addq %r8, %r9
decq %rcx
jne .LBB0_1

BB#2:

Ret

This is what I want to get:
ArrayAdd2: # @ArrayAdd2
.cfi_startproc

BB#0: # %Entry

xorl %eax, %eax
.align 16, 0x90
.LBB1_1: # %L_entry

=>This Inner Loop Header: Depth=1

movslq %eax, %r8
movss (%rdi,%r8,4), %xmm0
addss (%rsi,%r8,4), %xmm0
movss %xmm0, (%rdx,%r8,4)
incq %rax
cmpq %rax, %rcx
jne .LBB1_1

BB#2:

Ret