A volatile question.

Hi,

I have a small procedure that works without optimisations but
fails after running opt -O3. It’s written in Modula-3 and tests the exception
handling of the language. M3 uses traditional setjmp/longjmp for exceptions.
In the example, both raised and finally should be true when the EXCEPT handler runs.

(*-------------------- Test RAISE in FINALLY with another RAISE on stack. ---*)

PROCEDURE P25 () =
  (* Tests a RAISE inside a FINALLY part that runs while the exception
     raised in the TRY body is still being propagated.  The outer EXCEPT
     handler then checks that both flags were set before control reached it. *)
  VAR raised := FALSE; finally := FALSE;
  BEGIN
    TRY
      TRY
        raised := TRUE;    (* set before the first RAISE; checked in the handler *)
        RAISE E;
      FINALLY
        finally := TRUE;   (* records that the FINALLY part ran *)
        RAISE E;           (* second RAISE, issued from the FINALLY part *)
      END;
      (* Not expected to execute, since the FINALLY part always raises E.
         The pragma presumably silences the resulting compiler warning. *)
      Test.check (FALSE); <*NOWARN*>
    EXCEPT E =>
      Test.checkM(raised," P25 raised");
      Test.checkM(finally," P25 finally");
    END;
  END P25;

Here’s the generated IR. The FINALLY block gets turned into a nested procedure,
which I haven’t shown; that is why a display (%__Display) is set up. setjmp is declared
with the returns_twice attribute, but that doesn’t seem to make any difference.

define void @Main__P25() #0 !dbg !303 {
entry:
%raised = alloca i8, align 1
%finally = alloca i8, align 1
%tmp.181 = alloca i8*, align 8
%tmp.182 = alloca i8*, align 8
%tmp.183 = alloca i64, align 8
%tmp.184 = alloca %struct.2, align 8
%tmp.185 = alloca %struct.3, align 8
%__Display = alloca [1 x i8*]
%__Display.i8pp = bitcast [1 x i8*]* %__Display to i8**
%__NewDisplaySlot.i8pp = getelementptr inbounds i8*, i8** %__Display.i8pp, i64 0
store i8* %finally, i8** %__NewDisplaySlot.i8pp
br label %second, !dbg !304

second: ; preds = %entry
call void @llvm.dbg.declare(metadata i8* %raised, metadata !305, metadata !DIExpression()), !dbg !307
call void @llvm.dbg.declare(metadata i8* %finally, metadata !308, metadata !DIExpression()), !dbg !307
store i8 0, i8* %finally, align 1, !dbg !309
store i8 0, i8* %raised, align 1, !dbg !309
store i8 0, i8* %finally, align 1, !dbg !309
%v.58 = load i64, i64* @m3_jmpbuf_size, align 8, !dbg !309
store i64 %v.58, i64* %tmp.183, align 8, !dbg !309
%v.183 = load i64, i64* %tmp.183, align 8, !dbg !309
%umul = mul nuw i64 2, %v.183, !dbg !309
%jmpbuf_size = alloca i8, i64 %umul, !dbg !309
store i8* %jmpbuf_size, i8** %tmp.181, align 8, !dbg !309
%v.181 = load i8*, i8** %tmp.181, align 8, !dbg !309
%loophole-addr_word = ptrtoint i8* %v.181 to i64, !dbg !309
%v.1831 = load i64, i64* %tmp.183, align 8, !dbg !309
%uadd = add nuw i64 %loophole-addr_word, %v.1831, !dbg !309
%loophole-addr_word2 = inttoptr i64 %uadd to i8*, !dbg !309
store i8* %loophole-addr_word2, i8** %tmp.182, align 8, !dbg !309
br label %label_121, !dbg !310

label_121: ; preds = %second
%store_base.i8p = bitcast %struct.2* %tmp.184 to i8*, !dbg !310
%store_dest.i8p = getelementptr inbounds i8, i8* %store_base.i8p, i64 16, !dbg !310
%store_dest = bitcast i8* %store_dest.i8p to i8**, !dbg !310
store i8* getelementptr inbounds (i8, i8* bitcast (%M_Const_struct* @M_Const to i8*), i64 136), i8** %store_dest, align 8, !dbg !310
%store_base.i8p3 = bitcast %struct.2* %tmp.184 to i8*, !dbg !310
%store_dest.i8p4 = getelementptr inbounds i8, i8* %store_base.i8p3, i64 8, !dbg !310
%store_dest5 = bitcast i8* %store_dest.i8p4 to i64*, !dbg !310
store i64 0, i64* %store_dest5, align 8, !dbg !310
%pop_toadr = bitcast %struct.2* %tmp.184 to i8*, !dbg !310
call void @RTHooks__PushEFrame(i8* %pop_toadr), !dbg !310
%v.1816 = load i8*, i8** %tmp.181, align 8, !dbg !310
%store_base.i8p7 = bitcast %struct.2* %tmp.184 to i8*, !dbg !310
%store_dest.i8p8 = getelementptr inbounds i8, i8* %store_base.i8p7, i64 96, !dbg !310
%store_dest9 = bitcast i8* %store_dest.i8p8 to i8**, !dbg !310
store i8* %v.1816, i8** %store_dest9, align 8, !dbg !310
%v.18110 = load i8*, i8** %tmp.181, align 8, !dbg !310
%result = call i64 @_setjmp(i8* %v.18110), !dbg !310
%icmp = icmp ne i64 %result, 0, !dbg !310
br i1 %icmp, label %if_122, label %else_122, !dbg !310

else_122: ; preds = %label_121
%store_base.i8p11 = bitcast %struct.3* %tmp.185 to i8*, !dbg !311
%store_dest.i8p12 = getelementptr inbounds i8, i8* %store_base.i8p11, i64 16, !dbg !311
%store_dest13 = bitcast i8* %store_dest.i8p12 to i8**, !dbg !311
store i8* bitcast (void (i8*)* @Main_M3_LINE_451 to i8*), i8** %store_dest13, align 8, !dbg !311
%__static_link_from_display = bitcast i8** %__Display.i8pp to i8*, !dbg !311
%store_base.i8p14 = bitcast %struct.3* %tmp.185 to i8*, !dbg !311
%store_dest.i8p15 = getelementptr inbounds i8, i8* %store_base.i8p14, i64 24, !dbg !311
%store_dest16 = bitcast i8* %store_dest.i8p15 to i8**, !dbg !311
store i8* %__static_link_from_display, i8** %store_dest16, align 8, !dbg !311
br label %label_123, !dbg !311

label_123: ; preds = %else_122
%store_base.i8p17 = bitcast %struct.3* %tmp.185 to i8*, !dbg !311
%store_dest.i8p18 = getelementptr inbounds i8, i8* %store_base.i8p17, i64 8, !dbg !311
%store_dest19 = bitcast i8* %store_dest.i8p18 to i64*, !dbg !311
store i64 3, i64* %store_dest19, align 8, !dbg !311
%pop_toadr20 = bitcast %struct.3* %tmp.185 to i8*, !dbg !311
call void @RTHooks__PushEFrame(i8* %pop_toadr20), !dbg !311
store i8 1, i8* %raised, align 1, !dbg !312
call void @RTHooks__Raise(i8* bitcast (%M_Const_struct* @M_Const to i8*), i8* null, i8* bitcast (%M_Main_struct* @M_Main to i8*), i64 449), !dbg !313
br label %label_124, !dbg !313

label_124: ; preds = %label_123
br label %if_122, !dbg !314

if_122: ; preds = %label_124, %label_121
%v.27 = load i8, i8* %raised, align 1, !dbg !315
%zext = zext i8 %v.27 to i64, !dbg !315
%pop_trunc = trunc i64 %zext to i8, !dbg !315
call void @Test__checkM(i8 %pop_trunc, i8* getelementptr inbounds (i8, i8* bitcast (%M_Const_struct* @M_Const to i8*), i64 280)), !dbg !315

Here is a portion of the optimised IR.

%result = call i64 @_setjmp(i8* nonnull %jmpbuf_size), !dbg !465
%icmp = icmp eq i64 %result, 0, !dbg !465
br i1 %icmp, label %else_122, label %if_122, !dbg !465

else_122: ; preds = %entry
%tmp.1853.sub = getelementptr inbounds [40 x i8], [40 x i8]* %tmp.1853, i64 0, i64 0
%store_dest.i8p12 = getelementptr inbounds [40 x i8], [40 x i8]* %tmp.1853, i64 0, i64 16, !dbg !466
%store_dest13 = bitcast i8* %store_dest.i8p12 to i8**, !dbg !466
store i8* bitcast (void (i8*)* @Main_M3_LINE_451 to i8*), i8** %store_dest13, align 8, !dbg !466
%store_dest.i8p15 = getelementptr inbounds [40 x i8], [40 x i8]* %tmp.1853, i64 0, i64 24, !dbg !466
%0 = bitcast i8* %store_dest.i8p15 to i8***, !dbg !466
store i8** %__Display, i8*** %0, align 8, !dbg !466
%store_dest.i8p18 = getelementptr inbounds [40 x i8], [40 x i8]* %tmp.1853, i64 0, i64 8, !dbg !466
%store_dest19 = bitcast i8* %store_dest.i8p18 to i64*, !dbg !466
store i64 3, i64* %store_dest19, align 8, !dbg !466
call void @RTHooks__PushEFrame(i8* nonnull %tmp.1853.sub), !dbg !466
call void @llvm.dbg.value(metadata i8 1, metadata !460, metadata !DIExpression()), !dbg !462
call void @RTHooks__Raise(i8* bitcast (%M_Const_struct* @M_Const to i8*), i8* null, i8* bitcast (%M_Main_struct* @M_Main to i8*), i64 449), !dbg !467
br label %if_122, !dbg !468

if_122: ; preds = %entry, %else_122
%raised.0 = phi i8 [ 0, %entry ], [ 1, %else_122 ], !dbg !462
call void @llvm.dbg.value(metadata i8 %raised.0, metadata !460, metadata !DIExpression()), !dbg !462
call void @Test__checkM(i8 %raised.0, i8* bitcast (i8** getelementptr inbounds (%M_Const_struct, %M_Const_struct* @M_Const, i64 0, i32 39) to i8*)), !dbg !469

And here is the relevant bit of the x86-64 assembly:

leaq M_Const(%rip), %rdi
leaq M_Main(%rip), %rdx
movl $449, %ecx # imm = 0x1C1
xorl %esi, %esi
callq RTHooks__Raise@PLT
movb $1, %al # <-- Note: raised is only set to true here, after the call to RTHooks__Raise (which longjmps)
.Ltmp272:
.LBB47_3: # %if_122
#DEBUG_VALUE: P25:finally <- 0
#DEBUG_VALUE: P25:raised <- $al
.loc 1 457 0 # Main.m3:457:0
movzbl %al, %edi
leaq M_Const+280(%rip), %rsi
callq Test__checkM@PLT

The raised alloca has been optimised away. The store of 1 to raised before the call to RTHooks__Raise has been eliminated, which means raised remains false until the phi after the Raise.
I guess this happens because the optimiser cannot know that RTHooks__Raise eventually calls longjmp.

After a bit of digging, I found that volatile on loads and stores could fix the problem.
Setting it on loads alone doesn’t help but setting all stores in the procedure to volatile
does the trick.

I’m wondering if this is the right stick to use and whether it’s too big. A try except
block might be a small percentage of a function and I might be losing optimisations
elsewhere.

Thanks Peter

Hi Peter,

> I'm wondering if this is the right stick to use and whether it's too big. A try except
> block might be a small percentage of a function and I might be losing optimisations
> elsewhere.

It's probably the only way if you stick to explicit setjmp/longjmp
style calls, but it is a very nasty hammer. I think you should try to
switch to using LLVM's expected exception path: invoke + landingpad.

For x86, LLVM's expecting to use DWARF-based exception handling; but I
couldn't think of any obvious reason why that's fatal to you. LLVM
emits extra DWARF tables, but you can just ignore (or strip) them and
carry on calling your registration and raise functions. LLVM is happy
because it knows what the real control flow of the function is.

Alternatively, if that fails, LLVM does know about setjmp/longjmp exceptions,
so teaching it about your variant is unlikely to be a huge effort, although
it might be more backend work than you'd like.

Cheers.

Tim.