Issue with X86FrameLowering __chkstk on Windows 8 64-bit / Visual Studio 2012

Hi,

I'm using LLVM to convert expressions to native assembly. The problem
occurs when LLVM compiles this code:

; Reproduction case. Of the three i8* arguments only the first (%0) is used;
; it is read and written at byte offsets 0, 64 and 148.
; Only %res is allocated in the entry block %bb. The remaining allocas
; (%zf, %zf1, %sf, %sf5, %af, %of, %cf) are placed in later basic blocks.
define void @fn_0000000000000000(i8*, i8*, i8*) {
bb:
  %res = alloca i32
  ; %3 is loaded but never used afterwards.
  %3 = load i32* %res
  %4 = bitcast i8* %0 to i32*
  %5 = load i32* %4
  %6 = bitcast i8* %0 to i32*
  %7 = load i32* %6
  ; Both xor operands are non-volatile loads from the same address (%0).
  %8 = xor i32 %5, %7
  store volatile i32 %8, i32* %res
  %9 = load i32* %res
  ; Branch on whether the stored result is zero.
  %10 = icmp eq i32 %9, 0
  br i1 %10, label %then, label %else

merged: ; preds = %else, %then
  %11 = load i32* %res
  ; -2147483648 is 0x80000000: isolate the top bit of the result.
  %12 = and i32 %11, -2147483648
  %13 = icmp eq i32 %12, 0
  br i1 %13, label %then3, label %else4

then: ; preds = %bb
  ; Alloca outside the entry block. %zf is loaded once (%14) and the loaded
  ; value is never used; the store below goes to %0 + 148, not to %zf.
  %zf = alloca i1
  %14 = load i1* %zf
  %15 = getelementptr i8* %0, i32 148
  %16 = bitcast i8* %15 to i1*
  %17 = load i1* %16
  store volatile i1 true, i1* %16
  br label %merged

else: ; preds = %bb
  ; Same shape as %then, but stores false.
  %zf1 = alloca i1
  %18 = load i1* %zf1
  %19 = getelementptr i8* %0, i32 148
  %20 = bitcast i8* %19 to i1*
  %21 = load i1* %20
  store volatile i1 false, i1* %20
  br label %merged

merged2: ; preds = %else4, %then3
  ; Write the result back to the i32 at %0 + 0.
  %22 = bitcast i8* %0 to i32*
  %23 = load i32* %22
  %24 = load i32* %res
  store volatile i32 %24, i32* %22
  ; Three more non-entry allocas (%af, %of, %cf), each followed by a store of
  ; false to %0 + 148.
  ; NOTE(review): every flag store in this function targets the same offset
  ; (148) -- confirm whether distinct offsets were intended.
  %af = alloca i1
  %25 = load i1* %af
  %26 = getelementptr i8* %0, i32 148
  %27 = bitcast i8* %26 to i1*
  %28 = load i1* %27
  store volatile i1 false, i1* %27
  %of = alloca i1
  %29 = load i1* %of
  %30 = getelementptr i8* %0, i32 148
  %31 = bitcast i8* %30 to i1*
  %32 = load i1* %31
  store volatile i1 false, i1* %31
  %cf = alloca i1
  %33 = load i1* %cf
  %34 = getelementptr i8* %0, i32 148
  %35 = bitcast i8* %34 to i1*
  %36 = load i1* %35
  store volatile i1 false, i1* %35
  ; Add 2 to the i32 at %0 + 64.
  %37 = getelementptr i8* %0, i32 64
  %38 = bitcast i8* %37 to i32*
  %39 = load i32* %38
  %40 = getelementptr i8* %0, i32 64
  %41 = bitcast i8* %40 to i32*
  %42 = load i32* %41
  %43 = add i32 %42, 2
  store volatile i32 %43, i32* %38
  ret void

then3: ; preds = %merged
  ; Top bit of the result is clear: store false to %0 + 148.
  %sf = alloca i1
  %44 = load i1* %sf
  %45 = getelementptr i8* %0, i32 148
  %46 = bitcast i8* %45 to i1*
  %47 = load i1* %46
  store volatile i1 false, i1* %46
  br label %merged2

else4: ; preds = %merged
  ; Top bit of the result is set: store true to %0 + 148.
  %sf5 = alloca i1
  %48 = load i1* %sf5
  %49 = getelementptr i8* %0, i32 148
  %50 = bitcast i8* %49 to i1*
  %51 = load i1* %50
  store volatile i1 true, i1* %50
  br label %merged2
}

It generates the following assembly:
0000000581D30010 push rbp
0000000581D30011 mov rbp,rsp
0000000581D30014 sub rsp,10h
0000000581D30018 mov dword ptr [rbp-4],0
0000000581D3001F mov al,1
0000000581D30021 test al,al
0000000581D30023 jne 0000000581D30042
0000000581D30029 mov eax,10h
0000000581D3002E call 00000005F08425D0
0000000581D30033 sub rsp,rax
0000000581D30036 mov byte ptr [rcx+94h],0
0000000581D3003D jmp 0000000581D30056
0000000581D30042 mov eax,10h
0000000581D30047 call 00000005F08425D0
0000000581D3004C sub rsp,rax
0000000581D3004F mov byte ptr [rcx+94h],1
0000000581D30056 test byte ptr [rbp-1],80h
0000000581D3005A je 0000000581D30079
0000000581D30060 mov eax,10h
0000000581D30065 call 00000005F08425D0
0000000581D3006A sub rsp,rax
0000000581D3006D mov byte ptr [rcx+94h],1
0000000581D30074 jmp 0000000581D3008D
0000000581D30079 mov eax,10h
0000000581D3007E call 00000005F08425D0
0000000581D30083 sub rsp,rax
0000000581D30086 mov byte ptr [rcx+94h],0
0000000581D3008D mov eax,dword ptr [rbp-4]
0000000581D30090 mov dword ptr [rcx],eax
0000000581D30092 mov eax,10h
0000000581D30097 call 00000005F08425D0
0000000581D3009C sub rsp,rax
0000000581D3009F mov byte ptr [rcx+94h],0
0000000581D300A6 mov eax,10h
0000000581D300AB call 00000005F08425D0
0000000581D300B0 sub rsp,rax
0000000581D300B3 mov byte ptr [rcx+94h],0
0000000581D300BA mov eax,10h
0000000581D300BF call 00000005F08425D0
0000000581D300C4 sub rsp,rax
0000000581D300C7 mov byte ptr [rcx+94h],0
0000000581D300CE add dword ptr [rcx+40h],2
0000000581D300D2 mov rsp,rbp
0000000581D300D5 pop rbp
0000000581D300D6 ret

The function located at 0x00000005F08425D0 is not valid (according to
Visual Studio: 0x00000005F08425D0 ?? ??).

If I compile the LLVM bytecode using llc, the function being called is __chkstk:
        # llc output (Intel operand order) for fn_0000000000000000.
        # The sequence "mov eax, 16 / call __chkstk / sub rsp, rax" appears
        # seven times, once per alloca that is outside the IR entry block.
        .def fn_0000000000000000;
        .scl 2;
        .type 32;
        .endef
        .text
        .globl fn_0000000000000000
        .align 16, 0x90
fn_0000000000000000: # @fn_0000000000000000
# BB#0: # %bb
        # Frame setup; the 16-byte static area holds the slot at [rbp - 4].
        push rbp
        mov rbp, rsp
        sub rsp, 16
        # The IR's xor result has been folded to the constant 0.
        mov dword ptr [rbp - 4], 0
        # al is the constant 1, so this jne is always taken.
        mov al, 1
        test al, al
        jne .LBB0_1
# BB#2: # %else
        # Dynamic 16-byte stack allocation: size in eax, call __chkstk,
        # then lower rsp by rax.
        mov eax, 16
        call __chkstk
        sub rsp, rax
        # rcx is the first integer argument in the Microsoft x64 convention.
        mov byte ptr [rcx + 148], 0
        jmp .LBB0_3
.LBB0_1: # %then
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 1
.LBB0_3: # %merged
        # [rbp - 1] is the most significant byte of the dword at [rbp - 4];
        # -128 (0x80) tests its top bit.
        test byte ptr [rbp - 1], -128
        je .LBB0_4
# BB#5: # %else4
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 1
        jmp .LBB0_6
.LBB0_4: # %then3
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 0
.LBB0_6: # %merged2
        # Copy the dword at [rbp - 4] to the dword at [rcx].
        mov eax, dword ptr [rbp - 4]
        mov dword ptr [rcx], eax
        # Three further dynamic allocations, each followed by a byte store
        # of 0 to [rcx + 148].
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 0
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 0
        mov eax, 16
        call __chkstk
        sub rsp, rax
        mov byte ptr [rcx + 148], 0
        add dword ptr [rcx + 64], 2
        # Restoring rsp from rbp undoes all of the rsp adjustments above.
        mov rsp, rbp
        pop rbp
        ret

It seems this issue has already been described here
https://groups.google.com/forum/#!topic/llvm-commit/htNjwbWsNe8

I'm using this code
https://github.com/wisk/medusa/blob/master/src/emul/llvm/llvm_emulator.cpp
which is pretty basic.

Please, tell me if you need further information about this issue.

It’s not a solution to the actual bug (which is, as the thread you linked discusses, a problem with the assumption on LLVM’s part that the __chkstk function lies within 2GB of the emitted code’s address space) but there is a simple workaround: hoist all allocas to the first basic block of your function. This allows the JIT to perform all stack allocations in a single adjustment of the SP instead of needing to use dynamic stack allocation, and thereby avoids the call to __chkstk entirely.

Hi Michael,

It's not a solution to the actual bug (which is, as the thread you linked
discusses, a problem with the assumption on LLVM's part that the __chkstk
function lies within 2GB of the emitted code's address space) but there is a
simple workaround: hoist all allocas to the first basic block of your
function. This allows the JIT to perform all stack allocations in a single
adjustment of the SP instead of needing to use dynamic stack allocation, and
thereby avoids the call to __chkstk entirely.

__chkstk is not, in general, specific to dynamic stack allocation.
On Windows, when one allocates more than 1 page (4 KB) of
stack, it is necessary to touch all the allocated space in order
to ensure the proper order of guard page allocation. Surely, it's
always required for dynamic stack allocation, because the amount of
allocation is not known in advance; however, it can be triggered for
static code as well. Consider e.g.

/* bar is only declared here; its definition is not shown. */
void bar(int*);
/* baz has a single fixed-size local array and no dynamic allocation.
   Assuming a 4-byte int, foo occupies 8000 bytes, i.e. more than one
   4 KB page. */
void baz() {
  int foo[2000];
  /* Pass the array to the external function. */
  bar(foo);
}