How can I remove these redundant copies between registers?

Hi,

I’ve been working on a Blackfin backend (llvm-3.6.0) based on the previous one that was removed in llvm-3.1.

llc generates code like this:

29 p1 = r2;
30 r5 = [p1];
31 p1 = r2;
32 r6 = [p1 + 4];
33 r5 = r6 + r5;
34 r6 = [p0 + -4];
35 r5 *= r6;
36 p1 = r2;
37 r6 = [p1 + 8];
38 p1 = r2;

p1 and r2 are in different register classes.

A p* register can be used to load/store values from memory, while an r* register cannot.

As we can see, lines 31, 36, and 38 can be deleted. How can I configure llc to do this? Or do I have to write a custom pass to do this optimization? Any suggestions are welcome.

Thanks,

Huang

Hello Huang,

Silly as this may sound, did you run OPT on the bitcode first before using LLC?

Cheers,

Sam

Hi Sam, thanks for your help.

I had never noticed OPT before. I tried running it on the bitcode, but I still get the code listed above.

FYI, this is what I did:
$ clang -c -m32 -O3 -emit-llvm ex11.c -o ex11.bc

$ opt -S -gvn ex11.bc > ex11.ll

$ llc -march=bfin ex11.ll

Is there anything I’m missing?

And this is what I did before:
$ clang -S -m32 -emit-llvm -O3 file.c -o file.ll
$ llc -march=bfin file.ll

Original C Source File:

1 typedef struct state {
2 int V[8][8];
3 int *offset[8];
4 } state_t;
5
6 void foo(state_t *state, int ch, int *buffer)
7 {
8 int *offset = state->offset[ch];
9
10 int idx, i;
11 for (i = 0, idx = 0; i < 100; i++, idx += 5) {
12 //long long tmp = 0;
13 int tmp = 0;
14 for (int j = 0; j < 2; j++) {
15 tmp += state->V[ch][offset[i]+2*j+0]*buffer[idx + j];
16 tmp += state->V[ch][offset[i]+2*j+1]*buffer[idx + j];
17 }
18
19 // disable optimization
20 //volatile long long ret = tmp;
21 volatile int ret = tmp;
22 }
23 }

The .ll file after running OPT on the .bc file:
; ModuleID = 'ex11.bc'
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.10.0"

%struct.state = type { [8 x [8 x i32]], [8 x i32*] }

; Function Attrs: nounwind ssp
define void @foo(%struct.state* nocapture readonly %state, i32 %ch, i32* nocapture readonly %buffer) #0 {
entry:
%ret = alloca i32, align 4
%arrayidx = getelementptr inbounds %struct.state* %state, i32 0, i32 1, i32 %ch
%0 = load i32** %arrayidx, align 4, !tbaa !2
br label %for.cond3.preheader

for.cond3.preheader: ; preds = %for.cond3.preheader, %entry
%i.052 = phi i32 [ 0, %entry ], [ %inc27, %for.cond3.preheader ]
%idx.051 = phi i32 [ 0, %entry ], [ %add28, %for.cond3.preheader ]
%arrayidx6 = getelementptr inbounds i32* %0, i32 %i.052
%1 = load i32* %arrayidx6, align 4, !tbaa !6
%arrayidx9 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %1
%2 = load i32* %arrayidx9, align 4, !tbaa !6
%arrayidx11 = getelementptr inbounds i32* %buffer, i32 %idx.051
%3 = load i32* %arrayidx11, align 4, !tbaa !6
%add17 = add nsw i32 %1, 1
%arrayidx20 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add17
%4 = load i32* %arrayidx20, align 4, !tbaa !6
%tmp = add i32 %4, %2
%tmp48 = mul i32 %tmp, %3
%add.1 = add nsw i32 %1, 2
%arrayidx9.1 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add.1
%5 = load i32* %arrayidx9.1, align 4, !tbaa !6
%add10.1 = add nuw nsw i32 %idx.051, 1
%arrayidx11.1 = getelementptr inbounds i32* %buffer, i32 %add10.1
%6 = load i32* %arrayidx11.1, align 4, !tbaa !6
%add17.1 = add nsw i32 %1, 3
%arrayidx20.1 = getelementptr inbounds %struct.state* %state, i32 0, i32 0, i32 %ch, i32 %add17.1
%7 = load i32* %arrayidx20.1, align 4, !tbaa !6
%tmp.1 = add i32 %7, %5
%tmp48.1 = mul i32 %tmp.1, %6
%add24.1 = add i32 %tmp48.1, %tmp48
store volatile i32 %add24.1, i32* %ret, align 4
%inc27 = add nuw nsw i32 %i.052, 1
%add28 = add nuw nsw i32 %idx.051, 5
%exitcond53 = icmp eq i32 %inc27, 100
br i1 %exitcond53, label %for.end29, label %for.cond3.preheader

for.end29: ; preds = %for.cond3.preheader
ret void
}

attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"PIC Level", i32 2}
!1 = !{!"clang version 3.6.0 (tags/RELEASE_360/final)"}
!2 = !{!3, !3, i64 0}
!3 = !{!"any pointer", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
!6 = !{!7, !7, i64 0}
!7 = !{!"int", !4, i64 0}

And the generated .s file:

.text
.macosx_version_min 10, 10
.file "ex11.ll"
.globl foo
.align 4
.type foo,@function
foo: // @foo
// BB#0: // %entry
link 16;
[fp - 4] = r4;
[fp - 8] = r5;
[fp - 12] = r6;
r3 = r1 << 2;
r4 = r0 + r3;
r3 = 0 (x);
r2 += 4;
p0 = r4;
r4 = [p0 + 256];
p0 = r2;
LBB0_1: // %for.cond3.preheader
// =>This Inner Loop Header: Depth=1
r2 = r1 << 5;
r2 = r0 + r2;
r5 = r4 + r3;
p1 = r5;
r5 = [p1];
r5 = r5 << 2;
r2 = r2 + r5;
p1 = r2; <--------------
r5 = [p1];
p1 = r2; <--------------- redundant copy
r6 = [p1 + 4];
r5 = r6 + r5;
r6 = [p0 + -4];
r5 *= r6;
p1 = r2; <--------------- redundant copy
r6 = [p1 + 8];
p1 = r2; <--------------- redundant copy
r2 = [p1 + 12];
r2 = r2 + r6;
r6 = [p0];
r2 *= r6;
r2 = r2 + r5;
[fp - 16] = r2;
r2 = p0;
r2 += 20;
r3 += 4;
r5 = 400 (z);
cc = r3 == r5;
p0 = r2;
if !cc jump LBB0_1;
jump LBB0_2;
LBB0_2: // %for.end29
r6 = [fp - 12];
r5 = [fp - 8];
r4 = [fp - 4];
unlink;
rts;
Ltmp0:
.size foo, Ltmp0-foo

Huang