va_arg on Windows 64 bits

Hi everyone,

I'm interested in variadic functions and how LLVM handles them. I
discovered that the Clang frontend does a great job of lowering
va_arg (more precisely, __builtin_va_arg) into target-dependent
code. I have also seen the va_arg instruction that exists at the IR level.

I found some information saying that va_arg (the IR one) currently does
not support all platforms. But since 2009, it seems that Windows 64 bits
is partially supported. So I tried to play with it and ran into the
following issue:

On Windows 64 bits, when passing arguments to a variadic function,
the first four parameters are passed in registers and the others on
the stack. Each stack argument is 8-byte aligned (I guess it's
related to the ABI).

For example, debugging the IR code at the end of this message gives the
following state right before the call. We can clearly see the 8-byte
alignment.

rcx : <i64> -6778056391233182162
rdx : <i8*> 0x13E1A4
r8 : <i64*> 0x50f070
r9 : <i64*> 0x50d830
0x2EE070 : <i64*> 0x50d830
0x2EE078 : <i32> 16
0x2EE080 : <i32> 10
0x2EE088 : <i32> 10
0x2EE090 : <i64*> 0x50ee40

When using va_arg (IR) to retrieve these parameters, it does not
respect the alignment and tries to access the parameters as if they
were contiguous in memory.

%0 = va_arg i8* %ap2, i64* ; OK
%1 = va_arg i8* %ap2, i64* ; OK
%2 = va_arg i8* %ap2, i64* ; OK (0x2EE070)
%3 = va_arg i8* %ap2, i32 ; OK (0x2EE078)
%4 = va_arg i8* %ap2, i32 ; Wrong ! 0x2EE07C
%5 = va_arg i8* %ap2, i32 ; Wrong ! 0x2EE080
%6 = va_arg i8* %ap2, i64* ; Wrong ! 0x2EE084

The result can be observed by running the IR code at the end.

E:\test>clang test.ll -o test.exe
E:\test>test.exe
values : n2 = 16, dna = 0, dnb = 10

n2, dna and dnb are respectively the three i32 variables.

Does anyone know how to fix this? An alignment attribute on the variadic
function does nothing, and VAArgInst does not support setAlignment()
the way AllocaInst does.

During my research, I found that when a VAArgInst is lowered
in SelectionDAG::expandVAArg(), the alignment information is retrieved
from the va_arg SDNode and the lowering is wrong (in this case).

The alignment is set in SelectionDAGBuilder::visitVAArg(), where it
creates a VAArg DAG node using DL.getABITypeAlignment(I.getType()), which
seems to be the source of the alignment information.

DL.getABITypeAlignment(I.getType()) returns 4 if the type is i32 and 8
for the i64 type. For testing, I forced it to 8 and the IR example below
worked fine.

Is there some kind of attribute to force function parameters to be
laid out contiguously? Or could it be that the va_arg alignment is wrongly
computed using DL.getABITypeAlignment?

Thanks in advance for your help.

Regards,

Gaël

Here is the IR code for testing:

; Module header: data layout and triple for a 64-bit Windows (MSVC) target.
target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-windows-msvc18.0.0"

; va_list is modelled here as a struct holding a single i8*.
%struct.va_list = type { i8* }

$"str" = comdat any

; printf format string used by @variadiquefunc (38 bytes including "\0A\00").
; Keep the c"..." literal on a single line: a line break inside the quotes
; alters (or breaks) the string. Breaks between tokens are harmless.
@"str" = linkonce_odr unnamed_addr constant [38 x i8]
  c"values : n2 = %d, dna = %d, dnb = %d\0A\00", comdat, align 1

declare i32 @printf(i8*, ...) #1

declare void @llvm.va_start(i8*)

declare void @llvm.va_end(i8*)

; Function Attrs: nounwind uwtable
; Reproducer entry point: builds several i64* values and passes them, together
; with three i32 constants, to @variadiquefunc as variadic arguments.
define i32 @main() #0 {
; Four i64 stack slots; their addresses are the i64* values passed below.
%r = alloca i64
%a = alloca i64
%b = alloca i64
%t = alloca i64
; One i64* slot per address; each address is stored and then loaded back.
%rPty = alloca i64*
%aPty = alloca i64*
%bPty = alloca i64*
%tPty = alloca i64*
store i64* %r, i64** %rPty
store i64* %a, i64** %aPty
store i64* %b, i64** %bPty
store i64* %t, i64** %tPty
%rLoad = load i64*, i64** %rPty
%aLoad = load i64*, i64** %aPty
%bLoad = load i64*, i64** %bPty
%tLoad = load i64*, i64** %tPty
; Extra i64 slot whose address, cast to i8*, is the second fixed argument.
%ret = alloca i64
%retPty = alloca i64*
store i64* %ret, i64** %retPty
%load = load i64*, i64** %retPty
%bit = bitcast i64* %load to i8*
; Two fixed arguments (i64, i8*) followed by seven variadic ones:
; three i64*, three i32 (16, 10, 10) and a final i64*.
call void (i64, i8*, ...) @variadiquefunc(i64 -6778056391233182162,
i8* %bit, i64* %rLoad, i64* %aLoad, i64* %bLoad, i32 16, i32 10, i32
10, i64* %tLoad)
ret i32 0
}

; Reads seven variadic arguments back with the IR va_arg instruction and
; prints the three i32 values through printf.
define internal void @variadiquefunc(i64 %p, i8* %pp, ...) {
entry:
; va_list storage, handed to llvm.va_start / va_arg / llvm.va_end as an i8*.
%ap = alloca %struct.va_list
%ap2 = bitcast %struct.va_list* %ap to i8*
call void @llvm.va_start(i8* %ap2)
; Requested types follow the order used at the call site in @main:
; i64*, i64*, i64*, i32, i32, i32, i64*.
%0 = va_arg i8* %ap2, i64*
%1 = va_arg i8* %ap2, i64*
%2 = va_arg i8* %ap2, i64*
%3 = va_arg i8* %ap2, i32
%4 = va_arg i8* %ap2, i32
%5 = va_arg i8* %ap2, i32
%6 = va_arg i8* %ap2, i64*
; Only the three i32 values (%3, %4, %5) are printed.
%7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([38 x
i8], [38 x i8]* @"str", i32 0, i32 0), i32 %3, i32 %4, i32 %5)
call void @llvm.va_end(i8* %ap2)
ret void
}

; Attribute groups: #0 is attached to @main, #1 to the @printf declaration.
; One attribute per line so that no quoted attribute name is split by line
; wrapping (a break inside the quotes changes the attribute name).
attributes #0 = {
  nounwind uwtable
  "disable-tail-calls"="false"
  "less-precise-fpmad"="false"
  "no-frame-pointer-elim"="false"
  "no-infs-fp-math"="false"
  "no-nans-fp-math"="false"
  "stack-protector-buffer-size"="8"
  "target-cpu"="x86-64"
  "target-features"="+fxsr,+mmx,+sse,+sse2"
  "unsafe-fp-math"="false"
  "use-soft-float"="false"
}
attributes #1 = {
  "disable-tail-calls"="false"
  "less-precise-fpmad"="false"
  "no-frame-pointer-elim"="false"
  "no-infs-fp-math"="false"
  "no-nans-fp-math"="false"
  "stack-protector-buffer-size"="8"
  "target-cpu"="x86-64"
  "target-features"="+fxsr,+mmx,+sse,+sse2"
  "unsafe-fp-math"="false"
  "use-soft-float"="false"
}