Recently I’ve been playing around with the LLVM intrinsics and metadata related to invariance. Specifically, I’m talking about the following intrinsics and metadata:
- the @llvm.invariant.start intrinsic and the @llvm.invariant.end intrinsic;
- the !invariant.load metadata;
- the !invariant.group metadata.
Although these intrinsics and metadata are not equivalent, all of them can mark a memory location as “invariant” within a specific program region. For example:
; Variant 1: invariance expressed with the @llvm.invariant.start intrinsic.
; Stores %init into a stack slot, calls @llvm.invariant.start on that slot,
; passes the slot to @clobber, then reloads the slot and returns the value.
define dso_local i32 @test_intrin(i32 noundef %init) {
%var.ptr = alloca i32, align 4
store i32 %init, ptr %var.ptr, align 4
; Size argument is 4, matching the i32 slot. The returned %inv is never used
; in this function, i.e. there is no matching @llvm.invariant.end call.
%inv = call ptr @llvm.invariant.start.p0(i64 4, ptr %var.ptr)
; @clobber is declared outside this snippet; presumably it may write through
; the pointer it receives.
call void @clobber(ptr noundef nonnull %var.ptr)
; Reload after the call; this is the load the optimizer is expected to fold.
%1 = load i32, ptr %var.ptr, align 4
ret i32 %1
}
; Variant 2: invariance expressed with !invariant.load metadata on the load.
; Stores %init into a stack slot, passes the slot to @clobber, then reloads
; the slot with a load tagged !invariant.load and returns the value.
define dso_local i32 @test_invariant_load(i32 noundef %init) {
%var.ptr = alloca i32, align 4
store i32 %init, ptr %var.ptr, align 4
; @clobber is declared outside this snippet; presumably it may write through
; the pointer it receives.
call void @clobber(ptr noundef nonnull %var.ptr)
; Only the load carries the metadata; the store above is untagged.
; The !0 node is defined outside this snippet.
%1 = load i32, ptr %var.ptr, align 4, !invariant.load !0
ret i32 %1
}
; Variant 3: invariance expressed with !invariant.group metadata.
; Stores %init into a stack slot, passes the slot to @clobber, then reloads
; the slot and returns the value. Both the store and the load are tagged.
define dso_local i32 @test_invariant_group(i32 noundef %init) {
%var.ptr = alloca i32, align 4
; The store and the later load use the same pointer operand (%var.ptr) and
; both carry !invariant.group. The !0 node is defined outside this snippet.
store i32 %init, ptr %var.ptr, align 4, !invariant.group !0
; @clobber is declared outside this snippet; presumably it may write through
; the pointer it receives.
call void @clobber(ptr noundef nonnull %var.ptr)
%1 = load i32, ptr %var.ptr, align 4, !invariant.group !0
ret i32 %1
}
LLVM can optimize all three functions above into:
; Optimizer output shown for the three input functions (here named @test).
; The reload of %var.ptr is gone: the function returns %init directly, and the
; parameter has gained the `returned` attribute. The alloca, the store and the
; call to @clobber are all still present.
define dso_local noundef i32 @test(i32 noundef returned %init) {
%var.ptr = alloca i32, align 4
store i32 %init, ptr %var.ptr, align 4
call void @clobber(ptr noundef nonnull %var.ptr)
; No load here: the stored value is forwarded to the return.
ret i32 %init
}
This is reasonable, since the memory location pointed to by %var.ptr
is marked as immutable. But once I add a call to @llvm.invariant.end
to the first function above:
; Variant 1 with a matching @llvm.invariant.end call added after the load.
; Stores %init into a stack slot, calls @llvm.invariant.start on it, passes
; the slot to @clobber, reloads the slot, calls @llvm.invariant.end, and
; returns the loaded value.
define dso_local i32 @test_intrin(i32 noundef %init) {
%var.ptr = alloca i32, align 4
store i32 %init, ptr %var.ptr, align 4
; Size argument is 4, matching the i32 slot; %inv is consumed by the
; @llvm.invariant.end call below.
%inv = call ptr @llvm.invariant.start.p0(i64 4, ptr %var.ptr)
; @clobber is declared outside this snippet; presumably it may write through
; the pointer it receives.
call void @clobber(ptr noundef nonnull %var.ptr)
; The load sits between the start and end calls, after the call to @clobber.
%1 = load i32, ptr %var.ptr, align 4
; Takes the %inv token from the start call plus the same size and pointer.
call void @llvm.invariant.end.p0(ptr %inv, i64 4, ptr %var.ptr)
ret i32 %1
}
The optimizer suddenly stops working and is no longer able to fold away the load
instruction. The full example is live on Compiler Explorer: Compiler Explorer
Is this a missed optimization?