I just disassembled some of the IR before and after optimization. This example
function squares a complex number:
let zsqr(r, i) = (r*r - i*i, 2*r*i)
My compiler is generating:
; Unoptimized IR for zsqr. %0 is the pointer the result pair is stored through;
; %1 is the { double, double } argument passed by value (field 0 = r,
; field 1 = i, matching the r*r - i*i and 2*r*i arithmetic below).
; Every read of a field of %1 follows the same pattern: allocate a fresh stack
; slot, store the whole of %1 into it, GEP to the field, and load it back.
; The function always returns i32 0.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
; --- r * r: field 0 of %1 is spilled and reloaded twice ---
%2 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%3 = getelementptr { double, double }* %2, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %3
%4 = getelementptr { double, double }* %2, i32 0, i32 0 ; <double*>
[#uses=1]
%5 = load double* %4 ; <double> [#uses=1]
%6 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%7 = getelementptr { double, double }* %6, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %7
%8 = getelementptr { double, double }* %6, i32 0, i32 0 ; <double*>
[#uses=1]
%9 = load double* %8 ; <double> [#uses=1]
%10 = mul double %5, %9 ; <double> [#uses=1]
; --- i * i: field 1 of %1 is spilled and reloaded twice ---
%11 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%12 = getelementptr { double, double }* %11, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %12
%13 = getelementptr { double, double }* %11, i32 0, i32 1 ; <double*>
[#uses=1]
%14 = load double* %13 ; <double> [#uses=1]
%15 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%16 = getelementptr { double, double }* %15, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %16
%17 = getelementptr { double, double }* %15, i32 0, i32 1 ; <double*>
[#uses=1]
%18 = load double* %17 ; <double> [#uses=1]
%19 = mul double %14, %18 ; <double> [#uses=1]
; --- first result component: r*r - i*i ---
%20 = sub double %10, %19 ; <double> [#uses=1]
; --- second result component: (2 * r) * i, with r and i spilled again ---
%21 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%22 = getelementptr { double, double }* %21, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %22
%23 = getelementptr { double, double }* %21, i32 0, i32 0 ; <double*>
[#uses=1]
%24 = load double* %23 ; <double> [#uses=1]
%25 = mul double 2.000000e+00, %24 ; <double> [#uses=1]
%26 = alloca { double, double } ; <{ double, double }*> [#uses=2]
%27 = getelementptr { double, double }* %26, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %1, { double, double }* %27
%28 = getelementptr { double, double }* %26, i32 0, i32 1 ; <double*>
[#uses=1]
%29 = load double* %28 ; <double> [#uses=1]
%30 = mul double %25, %29 ; <double> [#uses=1]
; --- build the result pair in a stack temporary, field by field ---
%31 = alloca { double, double } ; <{ double, double }*> [#uses=3]
%32 = getelementptr { double, double }* %31, i32 0, i32 0 ; <double*>
[#uses=1]
store double %20, double* %32
%33 = getelementptr { double, double }* %31, i32 0, i32 1 ; <double*>
[#uses=1]
store double %30, double* %33
; --- reload the temporary as one aggregate and copy it out through %0 ---
%34 = getelementptr { double, double }* %31, i32 0 ; <{ double, double }*>
[#uses=1]
%35 = load { double, double }* %34 ; <{ double, double }> [#uses=1]
%36 = getelementptr { double, double }* %0, i32 0 ; <{ double, double }*>
[#uses=1]
store { double, double } %35, { double, double }* %36
ret i32 0
}
But those LLVM optimization passes only reduce it to:
; zsqr after the optimization passes were run. %0 is the pointer the result
; pair is stored through; %1 is the { double, double } argument (field 0 = r,
; field 1 = i). Aggregate stores and loads now target the allocas directly,
; with no single-index getelementptr in between, and most memory operations
; carry "align 8".
; Each field read still goes through its own alloca: six stack copies of %1
; (%2, %5, %9, %12, %17, %21) plus the result temporary %25 remain.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
; --- r * r: field 0 of %1, spilled and reloaded twice ---
%2 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %2, align 8
%3 = getelementptr { double, double }* %2, i32 0, i32 0 ; <double*>
[#uses=1]
%4 = load double* %3, align 8 ; <double> [#uses=1]
%5 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %5, align 8
%6 = getelementptr { double, double }* %5, i32 0, i32 0 ; <double*>
[#uses=1]
%7 = load double* %6, align 8 ; <double> [#uses=1]
%8 = mul double %4, %7 ; <double> [#uses=1]
; --- i * i: field 1 of %1, spilled and reloaded twice ---
%9 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %9, align 8
%10 = getelementptr { double, double }* %9, i32 0, i32 1 ; <double*>
[#uses=1]
%11 = load double* %10, align 8 ; <double> [#uses=1]
%12 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %12, align 8
%13 = getelementptr { double, double }* %12, i32 0, i32 1 ; <double*>
[#uses=1]
%14 = load double* %13, align 8 ; <double> [#uses=1]
%15 = mul double %11, %14 ; <double> [#uses=1]
; --- first result component: r*r - i*i ---
%16 = sub double %8, %15 ; <double> [#uses=1]
; --- second result component: (r * 2) * i, with r and i spilled again ---
%17 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %17, align 8
%18 = getelementptr { double, double }* %17, i32 0, i32 0 ; <double*>
[#uses=1]
%19 = load double* %18, align 8 ; <double> [#uses=1]
%20 = mul double %19, 2.000000e+00 ; <double> [#uses=1]
%21 = alloca { double, double } ; <{ double, double }*> [#uses=2]
store { double, double } %1, { double, double }* %21, align 8
%22 = getelementptr { double, double }* %21, i32 0, i32 1 ; <double*>
[#uses=1]
%23 = load double* %22, align 8 ; <double> [#uses=1]
%24 = mul double %20, %23 ; <double> [#uses=1]
; --- build the result pair in a stack temporary, then copy it out via %0 ---
%25 = alloca { double, double } ; <{ double, double }*> [#uses=3]
%26 = getelementptr { double, double }* %25, i32 0, i32 0 ; <double*>
[#uses=1]
store double %16, double* %26, align 8
%27 = getelementptr { double, double }* %25, i32 0, i32 1 ; <double*>
[#uses=1]
store double %24, double* %27, align 8
%28 = load { double, double }* %25, align 8 ; <{ double, double }> [#uses=1]
store { double, double } %28, { double, double }* %0
ret i32 0
}
So the optimization passes are at least doing something but they are a long
way from generating optimal code. Does LLVM have any optimization passes that
would promote these structs out of the stack and replace the loads with
extractvalue instructions?
The ideal result is probably:
; Hand-written target for zsqr: no stack temporaries at all. The two fields of
; the by-value argument are pulled out with extractvalue and the two result
; components are stored straight through the result pointer.
;   %0 - pointer the result pair is written through
;   %1 - the { double, double } argument (field 0 = r, field 1 = i)
; The unnamed parameters already occupy %0 and %1, so instruction results are
; numbered from %2; reusing %1 for an extracted field would redefine the
; argument and is not valid SSA.
define fastcc i32 @zsqr({ double, double }*, { double, double }) {
entry:
%2 = extractvalue { double, double } %1, 0 ; r
%3 = extractvalue { double, double } %1, 1 ; i
%4 = mul double %2, %2 ; r*r
%5 = mul double %3, %3 ; i*i
%6 = sub double %4, %5 ; r*r - i*i
%7 = getelementptr { double, double }* %0, i32 0, i32 0
store double %6, double* %7, align 8
%8 = mul double %2, 2.0 ; 2*r
%9 = mul double %8, %3 ; 2*r*i
%10 = getelementptr { double, double }* %0, i32 0, i32 1
store double %9, double* %10, align 8
ret i32 0
}