Hello,
I have source code with two similar functions, process1 and process2:
struct CMPLX
{
    float real;
    float imag;
};

float process1()
{
    CMPLX a;
    a = {1.1f, 2.2f};
    return a.real;
}

float process2()
{
    CMPLX a = {1.1f, 2.2f};
    return a.real;
}
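The only difference is that process1 default-initializes a and then assigns to it from a braced temporary, while process2 initializes a directly. For illustration, here is a hypothetical third variant (not part of the attached module) that assigns the members individually and so avoids that temporary:

float process1_direct() // hypothetical variant, not in example.txt
{
    CMPLX a;
    a.real = 1.1f; // direct member stores, no intermediate CMPLX temporary
    a.imag = 2.2f;
    return a.real;
}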
Here is a file containing the IR module for the source code shown above:
example.txt (2.6 KB)
The unoptimized IR corresponding to those functions is:
; Function Attrs: mustprogress noinline nounwind uwtable
define dso_local noundef float @_Z8process1v() #0 {
entry:
  %a = alloca %struct.CMPLX, align 4
  %ref.tmp = alloca %struct.CMPLX, align 4
  %real = getelementptr inbounds %struct.CMPLX, %struct.CMPLX* %ref.tmp, i32 0, i32 0
  store float 0x3FF19999A0000000, float* %real, align 4
  %imag = getelementptr inbounds %struct.CMPLX, %struct.CMPLX* %ref.tmp, i32 0, i32 1
  store float 0x40019999A0000000, float* %imag, align 4
  %0 = bitcast %struct.CMPLX* %a to i8*
  %1 = bitcast %struct.CMPLX* %ref.tmp to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 8, i1 false)
  %real1 = getelementptr inbounds %struct.CMPLX, %struct.CMPLX* %a, i32 0, i32 0
  %2 = load float, float* %real1, align 4
  ret float %2
}
and
; Function Attrs: mustprogress noinline nounwind uwtable
define dso_local noundef float @_Z8process2v() #0 {
entry:
  %a = alloca %struct.CMPLX, align 4
  %0 = bitcast %struct.CMPLX* %a to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 bitcast (%struct.CMPLX* @__const._Z8process2v.a to i8*), i64 8, i1 false)
  %real = getelementptr inbounds %struct.CMPLX, %struct.CMPLX* %a, i32 0, i32 0
  %1 = load float, float* %real, align 4
  ret float %1
}
If I optimize this IR with the instcombine pass (I executed opt -S -instcombine example.txt with the attached file), the process2 function is optimized to
; Function Attrs: mustprogress noinline nounwind uwtable
define dso_local noundef float @_Z8process2v() #0 {
entry:
  ret float 0x3FF19999A0000000
}
which is expected, while the process1 function is not optimized. As far as I can tell, the relevant difference in the IR is that the memcpy in process2 reads from the constant global @__const._Z8process2v.a, whereas the memcpy in process1 reads from the local %ref.tmp.
So far I have not been able to achieve a similar optimization for process1 with generic LLVM passes. Is there a way to get this optimization done? Or are there reasons why this function should not be optimized?
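For reference, the exact invocation I used, together with what I believe is the equivalent spelling for the new pass manager (the -passes form assumes a reasonably recent opt):

opt -S -instcombine example.txt
opt -S -passes=instcombine example.txt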