FYI, when the loop variables are declared as:
integer(8) i, j, k
there are no truncations/extensions interfering, but the subtracts stay in the IR as late as loop strength reduction:
before:
; Loop:
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%offset.idx = add i64 %index, %103
%121 = sub i64 %offset.idx, %13
%122 = getelementptr float, ptr %110, i64 %121
%wide.load = load <4 x float>, ptr %122, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%123 = getelementptr float, ptr %122, i64 4
%wide.load30 = load <4 x float>, ptr %123, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%124 = sub i64 %offset.idx, %26
%125 = getelementptr float, ptr %113, i64 %124
%wide.load31 = load <4 x float>, ptr %125, align 4, !tbaa !3, !alias.scope !10
%126 = getelementptr float, ptr %125, i64 4
%wide.load32 = load <4 x float>, ptr %126, align 4, !tbaa !3, !alias.scope !10
%127 = fadd fast <4 x float> %wide.load31, %wide.load
%128 = fadd fast <4 x float> %wide.load32, %wide.load30
store <4 x float> %127, ptr %122, align 4, !tbaa !3, !alias.scope !7, !noalias !10
store <4 x float> %128, ptr %123, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%index.next = add nuw i64 %index, 8
%129 = icmp eq i64 %index.next, %n.vec
br i1 %129, label %middle.block, label %vector.body, !llvm.loop !12
after:
; Loop:
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%145 = shl i64 %index, 2
%scevgep51 = getelementptr i8, ptr %scevgep49, i64 %145
%scevgep52 = getelementptr i8, ptr %scevgep51, i64 -16
%wide.load = load <4 x float>, ptr %scevgep52, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%146 = shl i64 %index, 2
%scevgep50 = getelementptr i8, ptr %scevgep49, i64 %146
%wide.load30 = load <4 x float>, ptr %scevgep50, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%147 = shl i64 %index, 2
%scevgep41 = getelementptr i8, ptr %scevgep40, i64 %147
%wide.load31 = load <4 x float>, ptr %scevgep41, align 4, !tbaa !3, !alias.scope !10
%scevgep42 = getelementptr i8, ptr %scevgep41, i64 16
%wide.load32 = load <4 x float>, ptr %scevgep42, align 4, !tbaa !3, !alias.scope !10
%148 = fadd fast <4 x float> %wide.load31, %wide.load
%149 = fadd fast <4 x float> %wide.load32, %wide.load30
store <4 x float> %148, ptr %scevgep52, align 4, !tbaa !3, !alias.scope !7, !noalias !10
store <4 x float> %149, ptr %scevgep50, align 4, !tbaa !3, !alias.scope !7, !noalias !10
%index.next = add nuw i64 %index, 8
%150 = icmp eq i64 %n.vec, %index.next
br i1 %150, label %middle.block, label %vector.body, !llvm.loop !12
But the subtracts are optimized away eventually, so this is not necessarily a good reproducer.
So the question is how to enable reassociation for the following IR (this is the cleanest we can get after LICM):
81: ; preds = %.lr.ph, %81
%82 = phi i64 [ %72, %.lr.ph ], [ %93, %81 ]
%83 = phi i32 [ %67, %.lr.ph ], [ %92, %81 ]
%84 = sext i32 %83 to i64
%85 = sub nsw i64 %84, %13
%86 = getelementptr float, ptr %77, i64 %85
%87 = load float, ptr %86, align 4, !tbaa !3
%88 = sub nsw i64 %84, %33
%89 = getelementptr float, ptr %80, i64 %88
%90 = load float, ptr %89, align 4, !tbaa !3
%91 = fadd fast float %90, %87
store float %91, ptr %86, align 4, !tbaa !3
%92 = add i32 %83, 1
%93 = add nsw i64 %82, -1
%94 = icmp sgt i64 %93, 0
br i1 %94, label %81, label %._crit_edge
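For illustration, here is a hand-written sketch (not actual compiler output) of the form reassociation would ideally produce for this loop: the loop-invariant offsets %13 and %33 (both defined outside the loop above) are folded into base pointers before the loop, and each address inside the body becomes a plain function of the induction variable. The names %neg13, %neg33, %base.a and %base.b are made up; the remaining numbered values keep their original names for readability. Since the original getelementptrs are not marked inbounds, splitting them this way should be legal.
; hoisted out of the loop (loop-invariant):
%neg13 = sub i64 0, %13
%base.a = getelementptr float, ptr %77, i64 %neg13
%neg33 = sub i64 0, %33
%base.b = getelementptr float, ptr %80, i64 %neg33
; Loop:
81: ; preds = %.lr.ph, %81
%82 = phi i64 [ %72, %.lr.ph ], [ %93, %81 ]
%83 = phi i32 [ %67, %.lr.ph ], [ %92, %81 ]
%84 = sext i32 %83 to i64
%86 = getelementptr float, ptr %base.a, i64 %84
%87 = load float, ptr %86, align 4, !tbaa !3
%89 = getelementptr float, ptr %base.b, i64 %84
%90 = load float, ptr %89, align 4, !tbaa !3
%91 = fadd fast float %90, %87
store float %91, ptr %86, align 4, !tbaa !3
%92 = add i32 %83, 1
%93 = add nsw i64 %82, -1
%94 = icmp sgt i64 %93, 0
br i1 %94, label %81, label %._crit_edge
With the subtracts gone from the body, both accesses index the two hoisted base pointers with the same induction variable, which LSR should be able to lower to simple incremented pointers.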