[Codegen bug in LLVM 3.8?] br following `fcmp une` is present in ll, absent in asm

Hi,

We seem to have found a bug in the LLVM 3.8 code generator. We are using MCJIT, and have isolated working.ll and broken.ll after middle-end optimizations. In the block merge128, notice that broken.ll has an fcmp une comparison against zero and a conditional branch on its result:

merge128: ; preds = %true71, %false72
%_rtB_724 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%590 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 38
%591 = bitcast double* %590 to i64*
%_rtB__Step31020 = load i64, i64* %591, align 1
%592 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_724, i64 0, i32 39
%593 = bitcast [4 x double]* %592 to i64*
store i64 %_rtB__Step31020, i64* %593, align 1
%_rtB_726 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%594 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_726, i64 0, i32 39, i64 1
store double 0.000000e+00, double* %594, align 1
%_rtB_727 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%595 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_727, i64 0, i32 39, i64 2
store double 0.000000e+00, double* %595, align 1
%_rtB_728 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%596 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_728, i64 0, i32 39, i64 3
store double 0.000000e+00, double* %596, align 1
%_rtP_729 = load %P_repro_T*, %P_repro_T** %rtP, align 8
%597 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_729, i64 0, i32 149
%_rtP__Kp_Gain_n = load double, double* %597, align 1
%_rtB_730 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%598 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 34, i64 0
%_rtB__Switch_k_el = load double, double* %598, align 1
%tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
%tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10061129, %tmp140
%599 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_730, i64 0, i32 35, i64 0
store double %tmp141, double* %599, align 1
%_rtP_733 = load %P_repro_T*, %P_repro_T** %rtP, align 8
%600 = getelementptr inbounds %P_repro_T, %P_repro_T* %_rtP_733, i64 0, i32 154
%_rtP__Ki_Gain_b = load double, double* %600, align 1
%_rtB_734 = load %B_repro_T*, %B_repro_T** %rtB, align 8
%601 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 34, i64 0
%_rtB__Switch_k_el735 = load double, double* %601, align 1
%tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
%602 = getelementptr inbounds %B_repro_T, %B_repro_T* %_rtB_734, i64 0, i32 37, i64 0
store double %tmp142, double* %602, align 1
%rtb_Sum3_737 = load double, double* %rtb_Sum3, align 8
%603 = fcmp une double %rtb_Sum3_737, 0.000000e+00
%_rtB_739 = load %B_repro_T*, %B_repro_T** %rtB, align 8
br i1 %603, label %true73, label %false74
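
For reference, here is a minimal standalone sketch of the same IR shape (the function and value names are hypothetical, not taken from the repro, and such a tiny case will not necessarily trigger the bug, which likely depends on the surrounding code):

; repro-sketch.ll (hypothetical): compare-and-branch on fcmp une in isolation
define double @sketch(double* %p) {
entry:
  %v = load double, double* %p, align 8
  %cmp = fcmp une double %v, 0.000000e+00
  br i1 %cmp, label %true73, label %false74

true73:                                           ; preds = %entry
  ret double 1.000000e+00

false74:                                          ; preds = %entry
  ret double 0.000000e+00
}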

Now, in broken.asm, notice that the same merge128 block is missing the branch instruction:

.LBB6_55: # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
movq 672(%rax), %rcx
movq %rcx, 200(%rsp)
movd %rcx, %xmm0
addsd 120(%rax), %xmm0
movq 176(%rsp), %rcx
mulsd 5680(%rcx), %xmm0
movsd %xmm0, 768(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero

We know that this is the right block to be looking at, because we can line it up with working.ll and working.asm:

merge128: ; preds = %true71, %false72
%_rtB_724 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%590 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 38
%591 = bitcast double* %590 to i64*
%_rtB__Step31025 = load i64, i64* %591, align 1
%592 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_724, i64 0, i32 39
%593 = bitcast [4 x double]* %592 to i64*
store i64 %_rtB__Step31025, i64* %593, align 1
%_rtB_726 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%594 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_726, i64 0, i32 39, i64 1
store double 0.000000e+00, double* %594, align 1
%_rtB_727 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%595 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_727, i64 0, i32 39, i64 2
store double 0.000000e+00, double* %595, align 1
%_rtB_728 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%596 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_728, i64 0, i32 39, i64 3
store double 0.000000e+00, double* %596, align 1
%_rtP_729 = load %P_repro_T.2*, %P_repro_T.2** %rtP, align 8
%597 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_729, i64 0, i32 149
%_rtP__Kp_Gain_n = load double, double* %597, align 1
%_rtB_730 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%598 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 34, i64 0
%_rtB__Switch_k_el = load double, double* %598, align 1
%tmp140 = fmul double %_rtP__Kp_Gain_n, %_rtB__Switch_k_el
%tmp141 = fadd double %_rtDW__broken_discrete_time_integrator1_DSTATE_el10111134, %tmp140
%599 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_730, i64 0, i32 35, i64 0
store double %tmp141, double* %599, align 1
%_rtP_733 = load %P_repro_T.2*, %P_repro_T.2** %rtP, align 8
%600 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_733, i64 0, i32 155
%_rtP__Ki_Gain_b = load double, double* %600, align 1
%_rtB_734 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
%601 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 34, i64 0
%_rtB__Switch_k_el735 = load double, double* %601, align 1
%tmp142 = fmul double %_rtP__Ki_Gain_b, %_rtB__Switch_k_el735
%602 = getelementptr inbounds %B_repro_T.0, %B_repro_T.0* %_rtB_734, i64 0, i32 37, i64 0
store double %tmp142, double* %602, align 1
%rtb_Sum3_737 = load double, double* %rtb_Sum3, align 8
%_rtP_738 = load %P_repro_T.2*, %P_repro_T.2** %rtP, align 8
%603 = getelementptr inbounds %P_repro_T.2, %P_repro_T.2* %_rtP_738, i64 0, i32 154
%_rtP__Switch_Threshold = load double, double* %603, align 1
%604 = fcmp ogt double %rtb_Sum3_737, %_rtP__Switch_Threshold
%_rtB_740 = load %B_repro_T.0*, %B_repro_T.0** %rtB, align 8
br i1 %604, label %true73, label %false74

working.ll comes from a slightly different model than broken.ll, in that it loads the “zero value” from memory and does fcmp ogt instead of fcmp une. Otherwise, they’re the same. Now, let’s look at working.asm:

.LBB6_55: # %merge128
movq 184(%rsp), %rcx
movq %rax, 728(%rcx)
movq 184(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 184(%rsp), %rax
movq $0, 744(%rax)
movq 184(%rsp), %rax
movq $0, 752(%rax)
movq 184(%rsp), %rax
movq $0, 760(%rax)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 160(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 176(%rsp), %rax
movsd 5656(%rax), %xmm0 # xmm0 = mem[0],zero
movq 184(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 192(%rsp), %xmm0 # xmm0 = mem[0],zero
movq 176(%rsp), %rax
ucomisd 5648(%rax), %xmm0
movq 184(%rsp), %rcx
jbe .LBB6_56
# BB#128: # %true73
movq 672(%rcx), %rdx
jmp .LBB6_129
.LBB6_56: # %false74
movq 696(%rcx), %rdx
.LBB6_129: # %merge129
movq %rdx, 200(%rsp)
movd %rdx, %xmm0
addsd 120(%rcx), %xmm0
mulsd 5688(%rax), %xmm0
movsd %xmm0, 768(%rcx)
movq 176(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero

Notice that the blocks true73 and false74 are completely absent from broken.asm.

If you want to generate this asm yourself, please run LLVM 3.8’s llc on the .ll files. For viewing convenience, use a difftool to compare broken.ll against working.ll and broken.asm against working.asm; I’ve highlighted the differences above at merge128.
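
For example (the exact invocation is illustrative; adjust paths to point at the 3.8 build):

llc broken.ll -o broken.asm
llc working.ll -o working.asm
diff broken.asm working.asm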

Further, we have instrumented this code to print the value of rtb_Sum3_737, and found that it takes values other than zero, hitting both branches at execution. We would like to know whether the community is aware of this bug, and which patch fixed it. Finally, broken-latest.asm shows the output of the latest llc; the jump is present and the bug has been fixed:

.LBB6_99: # %merge128
movq 8(%rsp), %rcx
movq %rax, 728(%rcx)
movq 8(%rsp), %rax
movq 728(%rax), %rcx
movq %rcx, 736(%rax)
movq 8(%rsp), %rax
movq $0, 744(%rax)
movq 8(%rsp), %rax
movq $0, 752(%rax)
movq 8(%rsp), %rax
movq $0, 760(%rax)
movq 16(%rsp), %rax
movsd 5608(%rax), %xmm0 # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd 40(%rsp), %xmm1 # 8-byte Reload # xmm1 = mem[0],zero
addsd %xmm0, %xmm1
movsd %xmm1, 672(%rax)
movq 16(%rsp), %rax
movsd 5648(%rax), %xmm0 # xmm0 = mem[0],zero
movq 8(%rsp), %rax
mulsd 648(%rax), %xmm0
movsd %xmm0, 704(%rax)
movsd 32(%rsp), %xmm0 # xmm0 = mem[0],zero
movq 8(%rsp), %rax
xorpd %xmm1, %xmm1
ucomisd %xmm1, %xmm0
jne .LBB6_100
jnp .LBB6_101
.LBB6_100: # %true73
movq 672(%rax), %rcx
jmp .LBB6_102
.LBB6_101: # %false74
movq 696(%rax), %rcx
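
As an aside, the jne/jnp pair here is the expected x86 lowering of fcmp une, since ucomisd reports an unordered result (a NaN operand) through PF. A sketch, with illustrative labels and registers:

# fcmp une %x, 0.0: ucomisd sets ZF=PF=CF=1 when either operand is NaN
xorpd %xmm1, %xmm1 # %xmm1 = 0.0
ucomisd %xmm1, %xmm0 # compare %xmm0 with 0.0
jne .Ltrue # ZF=0: not equal
jnp .Lfalse # PF=0: ordered, hence truly equal
.Ltrue: # fall-through: PF=1, unordered, so une holds
...
.Lfalse:
...

fcmp ogt, by contrast, lowers to the single ja/jbe seen in working.asm, because ja (CF=0 and ZF=0) is already false for unordered inputs.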

Thanks.

Ram

broken.asm (63.5 KB)

working.asm (63.6 KB)

broken.ll (210 KB)

working.ll (216 KB)

broken-latest.asm (62.3 KB)

LLVM 3.8 was released a year ago. A lot of things have changed since then.

Are you able to reproduce the problem with the current HEAD, or with the LLVM 4.0 release candidate?

–paulr

Nope, the bug seems to be fixed in the latest LLVM.

Ram

In terms of releases, I observe the bug to go away in LLVM 3.9.0. It seems the next step is to bisect the changes between 3.8.1 and 3.9.0. The following tool looks promising: https://github.com/llvm-mirror/zorg/blob/master/llvmbisect/docs/llvmlab_bisect.rst
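
A bisection like that needs a deterministic pass/fail predicate. Something along these lines might work as the test script (a rough sketch; the LLC variable, grep window, and jump patterns are guesses tied to this repro, and the pass/fail convention depends on the harness):

#!/bin/sh
# Exit 0 iff a conditional jump survives in merge128 (pass = branch present).
# $LLC points at the llc under test (hypothetical variable).
$LLC broken.ll -o out.s || exit 1
grep -A 60 '%merge128' out.s | grep -qE 'j(ne|np|a|be)' && exit 0
exit 1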

Ketan