How to unroll a reduction loop while caching the accumulator in a register?

Dear all,

The attached notunrolled.ll is a module containing a reduction kernel. What I’m trying to do is unroll it in such a way that the partial reduction over the unrolled iterations is performed in a register and then stored to memory only once. Currently LLVM’s unroller, together with all the standard optimizations, produces code that stores the value to memory after every unrolled iteration, which is much less efficient. Do you have an idea which combination of opt passes may help to keep the unrolled loop’s stores in a register?

Many thanks,

  • D.

notunrolled.ll (2.4 KB)

unrolled.ll (6.46 KB)

I tried to manually assign each of the 3 arrays a unique TBAA node, but it does not seem to help: alias analysis still considers the arrays as may-alias, which most likely prevents the desired optimization. Below is the sample code with the TBAA metadata inserted. Could you please suggest what might be wrong with it?

Many thanks,

  • D.

marcusmae@M17xR4:~/forge/llvm$ opt -time-passes -enable-tbaa -tbaa -print-alias-sets -O3 check.ll -o - -S
Alias Set Tracker: 1 alias sets for 3 pointer values.
AliasSet[0x39046c0, 3] may alias, Mod/Ref Pointers: (float* inttoptr (i64 47380979712 to float*), 4), (float* %p_newGEPInst9.cloned, 4), (float* %p_newGEPInst12.cloned, 4)

; ModuleID = 'check.ll'
;
; Test module with a single PTX kernel. The kernel's loop accumulates a sum of
; products and stores the running sum to a fixed address on every iteration.
; Each of the three memory locations it touches carries its own !tbaa tag.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

@__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"

; Kernel entry point. The unnamed i32* argument (%0) is not used in the body.
;
; Each thread computes index = (ctaid.x << 9) + tid.x. Threads whose index is
; greater than 65535 return immediately; every other thread runs a
; 512-iteration loop that sums products of two float loads and writes the
; running sum to the fixed address 47380979712.
define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) #0 {
"Loop Function Root":
  %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ; index = ctaid.x * 512 + tid.x
  %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
  %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
  ; Skip the loop entirely when index > 65535.
  %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x, 65535
  br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label %CUDA.LoopHeader.x.preheader

CUDA.LoopHeader.x.preheader:                      ; preds = %"Loop Function Root"
  %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
  ; Zero the output location before the loop starts.
  store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
  ; index * 512: element offset of this thread's run in the first input.
  %p_.moved.to.4.cloned = shl nsw i64 %1, 9
  br label %polly.loop_body

CUDA.AfterLoop.x:                                 ; preds = %polly.loop_body, %"Loop Function Root"
  ret void

polly.loop_body:                                  ; preds = %polly.loop_body, %CUDA.LoopHeader.x.preheader
  ; %p_scalar is the running sum; it already lives in an SSA value (phi).
  %p_scalar = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [ %p_8, %polly.loop_body ]
  %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [ %polly.next_loopiv, %polly.loop_body ]
  %polly.next_loopiv = add i64 %polly.loopiv10, 1
  ; First input is read at element (index * 512 + iv), second at element iv.
  %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
  %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to float*), i64 %p_
  %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520 to float*), i64 %polly.loopiv10
  %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !1
  %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !2
  %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
  %p_8 = fadd float %p_scalar, %p_7
  ; The running sum is written back to memory on every iteration.
  store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
  %exitcond = icmp eq i64 %polly.next_loopiv, 512
  br i1 %exitcond, label %CUDA.AfterLoop.x, label %polly.loop_body
}

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1

attributes #0 = { alwaysinline nounwind }
attributes #1 = { nounwind readnone }

; One TBAA tag per memory location. Each node's parent operand is null, so the
; three tags do not share a common root node.
!0 = metadata !{metadata !"output", null}
!1 = metadata !{metadata !"input1", null}
!2 = metadata !{metadata !"input2", null}

Just for the record, here’s what I was doing wrong.

!0 = metadata !{metadata !"output", null}
!1 = metadata !{metadata !"input1", null}
!2 = metadata !{metadata !"input2", null}

should be

!0 = metadata !{ }
!1 = metadata !{ metadata !"output", metadata !0 }
!2 = metadata !{ metadata !"input1", metadata !0 }
!3 = metadata !{ metadata !"input2", metadata !0 }

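with the corresponding renumbering of the !tbaa references on the loads and stores. The three nodes must share a common root: a node with a null parent is the root of its own TBAA tree, and accesses whose tags belong to different trees are conservatively treated as may-alias.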

With this metadata, opt -O3 successfully pulls the store out of the loop:

; ModuleID = 'check.ll'
;
; Output of opt -O3 for the kernel once its TBAA tags share a root node (!1).
; The in-loop store of the running sum is gone; a single store of the final
; value sits in the loop-exit block instead.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

@__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"

; Kernel entry point. The unnamed i32* argument (%0) is not used in the body.
;
; Each thread computes index = (ctaid.x << 9) + tid.x. Threads whose index is
; greater than 65535 return immediately; every other thread runs a
; 512-iteration loop that sums products of two float loads, then stores the
; final sum once to the fixed address 47380979712.
define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) nounwind alwaysinline {
"Loop Function Root":
  %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ; index = ctaid.x * 512 + tid.x
  %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
  %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
  ; Skip the loop entirely when index > 65535.
  %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x, 65535
  br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label %CUDA.LoopHeader.x.preheader

CUDA.LoopHeader.x.preheader:                      ; preds = %"Loop Function Root"
  %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
  ; Zero the output location before the loop starts.
  store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
  ; index * 512: element offset of this thread's run in the first input.
  %p_.moved.to.4.cloned = shl nsw i64 %1, 9
  br label %polly.loop_body

CUDA.AfterLoop.x.loopexit:                        ; preds = %polly.loop_body
  ; Single store of the final sum, executed once after the loop finishes.
  ; This store carries no !tbaa tag in the optimizer's output.
  store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192
  br label %CUDA.AfterLoop.x

CUDA.AfterLoop.x:                                 ; preds = %CUDA.AfterLoop.x.loopexit, %"Loop Function Root"
  ret void

polly.loop_body:                                  ; preds = %polly.loop_body, %CUDA.LoopHeader.x.preheader
  ; %p_scalar is the running sum, carried between iterations as an SSA value.
  %p_scalar = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [ %p_8, %polly.loop_body ]
  %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [ %polly.next_loopiv, %polly.loop_body ]
  %polly.next_loopiv = add i64 %polly.loopiv10, 1
  ; First input is read at element (index * 512 + iv), second at element iv.
  %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
  %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to float*), i64 %p_
  %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520 to float*), i64 %polly.loopiv10
  %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !2
  %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !3
  %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
  %p_8 = fadd float %p_scalar, %p_7
  ; No memory write remains inside the loop body.
  %exitcond = icmp eq i64 %polly.next_loopiv, 512
  br i1 %exitcond, label %CUDA.AfterLoop.x.loopexit, label %polly.loop_body
}

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone

declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone

; TBAA tags: !1 is the empty root node; !0, !2 and !3 are its three children,
; one per memory location.
!0 = metadata !{metadata !"output", metadata !1}
!1 = metadata !{}
!2 = metadata !{metadata !"input1", metadata !1}
!3 = metadata !{metadata !"input2", metadata !1}

2013/3/11 Dmitry Mikushin <dmitry@kernelgen.org>