Hi All,
Thank you very much for all the great information. This is awesome!
To circle back on Craig’s questions.
I did notice that LLVM 11 behaves very differently.
** Per: What does “incorrect math operations” mean?
The half is passed to the function as a float. The function then performs operations with other half values. On Windows, when the float-to-half conversion is not generated, the input is always truncated to 0.0.
** Per: “Do you have a more complete IR file for Windows that I can take a look at?”
I can get you our IR if you want, but I think it is more convoluted than required. I was working on a unit test and I think all one needs to see the anomaly is:
define void @foo(i8, i8, i8, i8, half) {
; CHECK-I686: callq __gnu_f2h_ieee
%6 = alloca half
store half %4, half* %6, align 1
ret void
}
x86_64-pc-windows gives:
push rax
.seh_stackalloc 8
.seh_endprologue
movss xmm0, dword ptr [rsp + 48] # xmm0 = mem[0],zero,zero,zero
movss dword ptr [rsp + 4], xmm0 # 4-byte Spill
pop rax
ret
.seh_handlerdata
.text
.seh_endproc
What I find extremely interesting is that the behavior seems to have something to do with the stack. If I reduce the number of inputs by one (so that the half arrives in xmm3 rather than on the stack), even Windows generates the conversion.
define void @foo(i8, i8, i8, half) {
; CHECK-I686: callq __gnu_f2h_ieee
%5 = alloca half
store half %3, half* %5, align 1
ret void
}
x86_64-pc-windows gives:
sub rsp, 40
.seh_stackalloc 40
.seh_endprologue
movabs rax, offset __gnu_f2h_ieee
movaps xmm0, xmm3
call rax
mov word ptr [rsp + 38], ax
add rsp, 40
ret
.seh_handlerdata
.text
.seh_endproc
** If interested, here is a dissection of our real asm.
On both Windows and Linux, our IR calls c2_foo() with a half value of 2.0 (0xH4000):
…
call void @c2_foo(i8* %S_6, [21 x i8*]* %ptr_gvar_instance_7, %emlrtStack* %c2_b_st_, [18 x float]* @15, half 0xH4000, [18 x i8]* %t10)
They both register this in c2_foo as:
…
%c2_in2_ = alloca half
store half %c2_in2, half* %c2_in2_, align 1
When we compile them, both pass 0x40000000 (2.0 as a single-precision float) to c2_foo.
The Linux c2_foo() asm addresses this with a float2half conversion:
…
mov qword ptr [rsp + 448], rdi
mov qword ptr [rsp + 440], rsi
mov qword ptr [rsp + 432], rdx
mov qword ptr [rsp + 424], rcx
movabs rcx, offset __gnu_f2h_ieee # <—Convert Here
mov qword ptr [rsp + 336], r8 # 8-byte Spill
call rcx
mov word ptr [rsp + 422], ax
mov rcx, qword ptr [rsp + 336] # 8-byte Reload
mov qword ptr [rsp + 408], rcx
mov qword ptr [rsp + 392], 0
mov qword ptr [rsp + 384], 0
mov qword ptr [rsp + 376], 0
mov qword ptr [rsp + 368], 0
mov rdx, qword ptr [rsp + 432]
mov qword ptr [rsp + 360], rdx
mov rdx, qword ptr [rsp + 432]
mov rdx, qword ptr [rdx + 8]
mov qword ptr [rsp + 352], rdx
mov rdx, qword ptr [rsp + 440]
mov rdx, qword ptr [rdx + 56]
mov qword ptr [rsp + 344], rdx
mov dword ptr [rsp + 400], 0
jmp .LBB9_9
The Windows c2_foo() asm is missing this conversion but treats the value as if it has been converted.
…
mov rax, qword ptr [rsp + 424]
movss xmm0, dword ptr [rsp + 416] # xmm0 = mem[0],zero,zero,zero # ← moves the data like it wants to convert but never does
mov qword ptr [rsp + 344], rcx
mov qword ptr [rsp + 336], rdx
mov qword ptr [rsp + 328], r8
mov qword ptr [rsp + 320], r9
mov qword ptr [rsp + 304], 0
mov qword ptr [rsp + 296], 0
mov qword ptr [rsp + 288], 0
mov qword ptr [rsp + 280], 0
mov rcx, qword ptr [rsp + 328]
mov qword ptr [rsp + 272], rcx
mov rcx, qword ptr [rsp + 328]
mov rcx, qword ptr [rcx + 8]
mov qword ptr [rsp + 264], rcx
mov rcx, qword ptr [rsp + 336]
mov rcx, qword ptr [rcx + 56]
mov qword ptr [rsp + 256], rcx
mov dword ptr [rsp + 312], 0
mov qword ptr [rsp + 248], rax # 8-byte Spill
movss dword ptr