Hi,
I want to use the loop unrolling pass; however, I find that loop unrolling introduces a conditional branch at the end of every “unrolled” part. For example, consider the following code:
// Writes each index into its own slot: array_x[k] = k for every 0 <= k < n.
// Nothing is written when n <= 0.
void foo( int n, int array_x[])
{
    for (int idx = 0; idx < n; ++idx) {
        array_x[idx] = idx;
    }
}
Then I run the command “opt-3.5 try.bc -mem2reg -loops -loop-simplify -loop-rotate -lcssa -indvars -loop-unroll -unroll-count=3 -simplifycfg -S”, which gives me this IR:
; foo(int, int*) after unrolling: the loop body appears three times
; (.lr.ph, %4 and %7), and each copy ends with its own exit test.
define void @_Z3fooiPi(i32 %n, i32* %array_x) #0 {
; Entry guard: only enter the loop when 0 < n.
%1 = icmp slt i32 0, %n
br i1 %1, label %.lr.ph, label %._crit_edge
; Copy 1 of the body: array_x[iv] = (i32)iv, then leave the loop if iv+1 == n.
.lr.ph: ; preds = %0, %7
%indvars.iv = phi i64 [ %indvars.iv.next.2, %7 ], [ 0, %0 ]
%2 = getelementptr inbounds i32* %array_x, i64 %indvars.iv
%3 = trunc i64 %indvars.iv to i32
store i32 %3, i32* %2
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp ne i32 %lftr.wideiv, %n
br i1 %exitcond, label %4, label %._crit_edge
; Shared exit, reached from the entry guard and from each of the three exit tests.
._crit_edge: ; preds = %.lr.ph, %4, %7, %0
ret void
; Copy 2 of the body: same store for iv+1, then leave the loop if iv+2 == n.
; <label>:4 ; preds = %.lr.ph
%5 = getelementptr inbounds i32* %array_x, i64 %indvars.iv.next
%6 = trunc i64 %indvars.iv.next to i32
store i32 %6, i32* %5
%indvars.iv.next.1 = add nuw nsw i64 %indvars.iv.next, 1
%lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
%exitcond.1 = icmp ne i32 %lftr.wideiv.1, %n
br i1 %exitcond.1, label %7, label %._crit_edge
; Copy 3 of the body: same store for iv+2, then either exit (iv+3 == n)
; or branch back to .lr.ph, whose phi picks up iv+3 as the next iv.
; <label>:7 ; preds = %4
%8 = getelementptr inbounds i32* %array_x, i64 %indvars.iv.next.1
%9 = trunc i64 %indvars.iv.next.1 to i32
store i32 %9, i32* %8
%indvars.iv.next.2 = add nuw nsw i64 %indvars.iv.next.1, 1
%lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32
%exitcond.2 = icmp ne i32 %lftr.wideiv.2, %n
br i1 %exitcond.2, label %.lr.ph, label %._crit_edge
}
As you can see, at the end of BB 4 and BB7 there are “add”, “icmp” and “br” instructions to check the loop bound. I understand this is needed for correctness. However, I would expect loop unrolling to transform my code into something like this:
// Hand-unrolled (factor 3) form of foo: array_x[k] = k for every 0 <= k < n.
// The main loop handles three elements per iteration with a single bound
// check; a remainder loop finishes the last n % 3 elements.
// Nothing is written when n <= 0.
void foo( int n, int array_x[])
{
    int j = n%3;    // number of leftover iterations (0, 1 or 2 for n >= 0)
    int m = n - j;  // largest multiple of 3 that does not exceed n (for n >= 0)
    for (int i=0; i < m; i+=3){
        array_x[i] = i;
        array_x[i+1] = i+1;
        array_x[i+2] = i+2;
    }
    // Remainder loop. It needs its own declaration of i: the i of the main
    // loop is scoped to that loop and is not visible here.
    for (int i=m; i<n; i++)
        array_x[i] = i;
}
In this case, BB4 and BB7 would not have the “add”, “icmp” and “br” instructions, because these BBs could be merged together.
How can I achieve this? Thanks.
Regards,
Xiangyang