[NVPTX] llc -march=nvptx64 -mcpu=sm_20 generates invalid zero align for device function params

Hello,

FYI, this has been filed as http://llvm.org/bugs/show_bug.cgi?id=13324

When compiling the following code for sm_20, function parameters are for some reason given .align 0, which is invalid. The problem does not occur when compiling for sm_10.

cat test.ll
; ModuleID = '__kernelgen_main_module'
target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"
target triple = "ptx64-unknown-unknown"

%struct.float2 = type { float, float }

define ptx_device void @__internal_dsmul(%struct.float2* noalias nocapture sret %agg.result, %struct.float2* nocapture byval %x, %struct.float2* nocapture byval %y) nounwind inlinehint alwaysinline {
entry:
%y1 = getelementptr inbounds %struct.float2* %x, i64 0, i32 1
%0 = load float* %y1, align 4
%sub = fsub float -0.000000e+00, %0
%1 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub, float 4.097000e+03, float %0) nounwind
%2 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %0, float 4.097000e+03, float %1) nounwind
%y5 = getelementptr inbounds %struct.float2* %y, i64 0, i32 1
%3 = load float* %y5, align 4
%sub7 = fsub float -0.000000e+00, %3
%4 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub7, float 4.097000e+03, float %3) nounwind
%5 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %3, float 4.097000e+03, float %4) nounwind
%sub12 = fsub float %0, %2
%sub14 = fsub float %3, %5
%6 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %3) nounwind
%sub18 = fsub float -0.000000e+00, %6
%7 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %5, float %sub18) nounwind
%8 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %2, float %sub14, float %7) nounwind
%9 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %5, float %sub12, float %8) nounwind
%10 = tail call float asm "mad.f32 $0, $1, $2, $3;", "=f,f,f,f"(float %sub12, float %sub14, float %9) nounwind
%x24 = getelementptr inbounds %struct.float2* %y, i64 0, i32 0
%11 = load float* %x24, align 4
%12 = tail call float @llvm.nvvm.mul.rn.f(float %0, float %11) nounwind
%x26 = getelementptr inbounds %struct.float2* %x, i64 0, i32 0
%13 = load float* %x26, align 4
%14 = tail call float @llvm.nvvm.mul.rn.f(float %13, float %3) nounwind
%add = fadd float %12, %14
%add29 = fadd float %10, %add
%15 = tail call float @llvm.nvvm.add.rn.f(float %6, float %add29) nounwind
%sub32 = fsub float %6, %15
%16 = tail call float @llvm.nvvm.add.rn.f(float %sub32, float %add29) nounwind
%agg.result.0 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 0
store float %16, float* %agg.result.0, align 8
%agg.result.1 = getelementptr inbounds %struct.float2* %agg.result, i64 0, i32 1
store float %15, float* %agg.result.1, align 4
ret void
}

declare ptx_device float @llvm.nvvm.add.rn.f(float, float) nounwind readnone

declare ptx_device float @llvm.nvvm.mul.rn.f(float, float) nounwind readnone

llc -march=nvptx64 -mcpu=sm_20 test.ll -o test.ptx
cat test.ptx
//
// Generated by LLVM NVPTX Back-End
//

.version 3.0
.target sm_20, texmode_independent
.address_size 64

  // .globl __internal_dsmul
.func __internal_dsmul(
  .param .b64 __internal_dsmul_param_0,
  .param .align 0 .b8 __internal_dsmul_param_1[8],
  .param .align 0 .b8 __internal_dsmul_param_2[8]
) // @__internal_dsmul
{
  .reg .pred %p<396>;
  .reg .s16 %rc<396>;
  .reg .s16 %rs<396>;
  .reg .s32 %r<396>;
  .reg .s64 %rl<396>;
  .reg .f32 %f<396>;
  .reg .f64 %fl<396>;

// BB#0: // %entry
  mov.b64 %rl0, __internal_dsmul_param_1;
  cvta.local.u64 %rl0, %rl0;
  ld.f32 %f0, [%rl0+4];
  neg.f32 %f1, %f0;
  mov.b64 %rl1, __internal_dsmul_param_2;
  mov.f32 %f2, 0f45800800;
  // inline asm
  mad.f32 %f1, %f1, %f2, %f0;
  // inline asm
  // inline asm
  mad.f32 %f3, %f0, %f2, %f1;
  // inline asm
  cvta.local.u64 %rl1, %rl1;
  ld.f32 %f1, [%rl1+4];
  neg.f32 %f4, %f1;
  // inline asm
  mad.f32 %f4, %f4, %f2, %f1;
  // inline asm
  // inline asm
  mad.f32 %f4, %f1, %f2, %f4;
  // inline asm
  sub.f32 %f5, %f0, %f3;
  sub.f32 %f6, %f1, %f4;
  mul.rn.f32 %f2, %f0, %f1;
  neg.f32 %f7, %f2;
  // inline asm
  mad.f32 %f7, %f3, %f4, %f7;
  // inline asm
  // inline asm
  mad.f32 %f3, %f3, %f6, %f7;
  // inline asm
  // inline asm
  mad.f32 %f3, %f4, %f5, %f3;
  // inline asm
  // inline asm
  mad.f32 %f3, %f5, %f6, %f3;
  // inline asm
  ld.f32 %f4, [%rl1];
  mul.rn.f32 %f0, %f0, %f4;
  ld.f32 %f4, [%rl0];
  mul.rn.f32 %f1, %f4, %f1;
  add.f32 %f0, %f0, %f1;
  add.f32 %f1, %f3, %f0;
  add.rn.f32 %f0, %f2, %f1;
  sub.f32 %f2, %f2, %f0;
  add.rn.f32 %f1, %f2, %f1;
  ld.param.u64 %rl0, [__internal_dsmul_param_0];
  st.f32 [%rl0], %f1;
  st.f32 [%rl0+4], %f0;
  ret;
}

ptxas -arch=sm_20 test.ptx -o ptx.cubin
ptxas test.ptx, line 13; error : Alignment must be a power of two
ptxas test.ptx, line 14; error : Alignment must be a power of two
ptxas fatal : Ptx assembly aborted due to errors

Dear all,

I'm attaching a patch that should fix the issue mentioned above. It
simply adds the same check that is already done in the same file for
global variables:

  emitPTXAddressSpace(PTy->getAddressSpace(), O);
  if (GVar->getAlignment() == 0)
    O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
  else
    O << " .align " << GVar->getAlignment();

Could you please review and commit? Do you think it needs a test case?

Thanks,
- D.

dmikushin@hp2:~/forge/align0> llc -march=nvptx64 -mcpu=sm_20 align0.ll -o -
//
// Generated by LLVM NVPTX Back-End
//

.version 3.1
.target sm_20
.address_size 64

  // .globl __internal_dsmul
.visible .func __internal_dsmul(
  .param .b64 __internal_dsmul_param_0,
  .param .align 4 .b8 __internal_dsmul_param_1[8],
  .param .align 4 .b8 __internal_dsmul_param_2[8]
) // @__internal_dsmul
{
  .reg .pred %p<396>;
  .reg .s16 %rc<396>;
  .reg .s16 %rs<396>;
  .reg .s32 %r<396>;
  .reg .s64 %rl<396>;
  .reg .f32 %f<396>;
  .reg .f64 %fl<396>;

// BB#0: // %entry
  mov.b64 %rl0, __internal_dsmul_param_1;
  cvta.local.u64 %rl0, %rl0;
  ld.f32 %f0, [%rl0+4];
  neg.f32 %f1, %f0;
  mov.b64 %rl1, __internal_dsmul_param_2;
  mov.f32 %f2, 0f45800800;
  // inline asm
  mad.f32 %f1, %f1, %f2, %f0;
  // inline asm
  // inline asm
  mad.f32 %f3, %f0, %f2, %f1;
  // inline asm
  cvta.local.u64 %rl1, %rl1;
  ld.f32 %f1, [%rl1+4];
  neg.f32 %f4, %f1;
  // inline asm
  mad.f32 %f4, %f4, %f2, %f1;
  // inline asm
  // inline asm
  mad.f32 %f4, %f1, %f2, %f4;
  // inline asm
  sub.f32 %f5, %f0, %f3;
  sub.f32 %f6, %f1, %f4;
  mul.rn.f32 %f2, %f0, %f1;
  neg.f32 %f7, %f2;
  // inline asm
  mad.f32 %f7, %f3, %f4, %f7;
  // inline asm
  // inline asm
  mad.f32 %f3, %f3, %f6, %f7;
  // inline asm
  // inline asm
  mad.f32 %f3, %f4, %f5, %f3;
  // inline asm
  // inline asm
  mad.f32 %f3, %f5, %f6, %f3;
  // inline asm
  ld.f32 %f4, [%rl1];
  mul.rn.f32 %f0, %f0, %f4;
  ld.f32 %f4, [%rl0];
  mul.rn.f32 %f1, %f4, %f1;
  add.f32 %f0, %f0, %f1;
  add.f32 %f1, %f3, %f0;
  add.rn.f32 %f0, %f2, %f1;
  sub.f32 %f2, %f2, %f0;
  add.rn.f32 %f1, %f2, %f1;
  ld.param.u64 %rl0, [__internal_dsmul_param_0];
  st.f32 [%rl0], %f1;
  st.f32 [%rl0+4], %f0;
  ret;
}

align0.patch (517 Bytes)

Hi Dmitry,

> I'm attaching a patch that should fix the issue mentioned above. It
> simply adds the same check that is already done in the same file for
> global variables:
>
>    emitPTXAddressSpace(PTy->getAddressSpace(), O);
>    if (GVar->getAlignment() == 0)
>      O << " .align " << (int) TD->getPrefTypeAlignment(ETy);
>    else
>      O << " .align " << GVar->getAlignment();

it's not quite the same because your patch uses the ABI alignment, while
in this snippet it is the preferred alignment (which is usually the same
as the ABI alignment, but may be bigger).
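For the %struct.float2 in the reproducer the two queries can indeed
disagree. Schematically (a hypothetical illustration, using the same
TargetData interface as the snippet above):

  // For %struct.float2 = type { float, float } on this target:
  unsigned ABIAlign  = TD->getABITypeAlignment(ETy);  // 4, the largest member alignment
  unsigned PrefAlign = TD->getPrefTypeAlignment(ETy); // may be bigger, e.g. 8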

> Could you please review and commit? Do you think it needs a test case?

Yes, it needs a testcase.

Ciao, Duncan.

Hi Duncan,

You're right, global variables use the preferred alignment. And yes, the
preferred alignment in this case is bigger: 8 instead of 4. NVIDIA's
proprietary compiler gives 4. However, since CUDA 5.0, PTX modules are
linkable with each other, so I think alignments for externally visible
functions and data should all follow the ABI rules.

Is there a guide on making tests? I have ~5 pending patches that were never
accepted simply because I'm not familiar with the LLVM test system :-/

- D.

Hi Dmitry,

> You're right, global variables use the preferred alignment. And yes, the
> preferred alignment in this case is bigger: 8 instead of 4. NVIDIA's
> proprietary compiler gives 4. However, since CUDA 5.0, PTX modules are
> linkable with each other, so I think alignments for externally visible
> functions and data should all follow the ABI rules.

giving it an alignment of 8 does follow the ABI rules: everything that is
8-byte aligned has an address that is a multiple of 8, and therefore also a
multiple of 4, i.e. it is also 4-byte aligned.

It would be wrong to assume that external globals have the preferred
alignment: they can only be assumed to have the ABI alignment. But as we
are aligning this variable ourselves, we can give it any alignment we like,
as long as it is at least 4.

In short, I think using the preferred alignment is correct in this
context.

> Is there a guide on making tests? I have ~5 pending patches that were never
> accepted simply because I'm not familiar with the LLVM test system :-/

Your test would go in test/CodeGen/PTX/. It should use FileCheck;
take a look at test/CodeGen/X86/zext-trunc.ll for an example. As
there are no PTX tests at all for the moment, you need to create a
lit.local.cfg file in PTX/. Imitate the X86 one.
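For example, a minimal test for this fix could look roughly like the
following (a sketch only; the struct type comes from the reproducer above,
and the CHECK lines are my assumption of what the fixed output should
satisfy):

; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s

%struct.float2 = type { float, float }

; The byval parameter must not be emitted with the invalid ".align 0".
; CHECK: .func byval_align
; CHECK-NOT: .align 0
; CHECK: ret;
define ptx_device void @byval_align(%struct.float2* nocapture byval %x) nounwind {
entry:
  ret void
}

Checking for the absence of ".align 0" keeps the test independent of
whether the fix ends up emitting the ABI or the preferred alignment.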

Ciao, Duncan.

Test cases exist under test/CodeGen/NVPTX (the name changed in May). Now that I'm back at NVIDIA, I'm going to be running through the bugzilla issues (thanks, Dmitry, for the reports!). I have practically the exact same patch here in my queue. :-)

In this case, I would prefer ABI alignment for compatibility with the vendor compiler. It should work either way, but I do need to audit the codebase and tie up any issues here.

Hi Justin,

Very glad you're finally back! How's your PhD?

Please see the attached updated patch. Since you think ABI alignment is
better, I'm only adding a test case.

- D.

align0.patch (898 Bytes)

Hi Justin,

> Test cases exist under test/CodeGen/NVPTX (the name changed in May).

I've deleted the empty PTX directory.

> Now that I'm back at NVIDIA, I'm going to be running through the bugzilla
> issues (thanks, Dmitry, for the reports!). I have practically the exact same
> patch here in my queue. :-)
>
> In this case, I would prefer ABI alignment for compatibility with the vendor
> compiler.

I don't really understand this argument. If the vendor compiler is aligning to
4 (say), then some globals will have an address that is a multiple of 4, some a
multiple of 8, some a multiple of 16, etc., depending on the accidents of just
where in memory they happen to be placed. For example, if you have two 4-byte
globals that follow each other in memory, and that are 4-byte aligned, then if
the first one has an address that is a multiple of 4 but not of 8, the second
will have an address that is a multiple of 8. In short, lots of variables will
be 8-byte aligned by accident. If LLVM gives them all an alignment of 8, what
does that change? OK, I will now admit that there is an effect if assumptions
are being made about globals being placed next to each other: if you declare
two globals A and B immediately after each other in the IR, the LLVM semantics
do not guarantee that they will be laid out one immediately after the other in
memory. But that is how it happens in practice, so maybe people are (wrongly)
relying on that. Bumping the alignment up to a multiple of 8 may add extra
padding between A and B, causing B to not be at the position that such naughty
people are expecting.

> It should work either way, but I do need to audit the codebase and tie up
> any issues here.

The IR optimizers already bump the alignment of some globals up to the
preferred alignment; check out enforceKnownAlignment in Local.cpp (it ends
up being called from instcombine).

Ciao, Duncan.

Perhaps "compatibility" is the wrong term to use here. For now, I would like to "match" what the vendor compiler does. I don't think using preferred alignment would hurt anything in terms of correctness, but I need to go through the entire back-end to see what effects it could have on performance (e.g. adding extra padding increases local memory usage). It could be a complete non-issue for all I know right now.

Hi Justin,

Perhaps "compatibility" is the wrong term to use here. For now, I would like to
"match" what the vendor compiler does. I don't think using preferred alignment
would hurt anything in terms of correctness, but I need to go through the entire
back-end to see what effects it could have on performance (e.g. adding extra
padding increases local memory usage). It could be a complete non-issue for all
I know right now.

don't forget that the backend defines what the preferred alignment is. You can
set it equal to the ABI alignment. If measurements show that a higher alignment
is better, then you can bump the preferred alignment up to a higher value. What
I'm saying is that globals should always be output using the preferred alignment,
not the ABI alignment: it is the preferred value itself that you should be playing
with.
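For reference, the per-type alignments come from the target's data layout
string, where each entry has the form <type><size>:<abi>:<pref>, in bits.
Taking the layout from the reproducer above as a (hypothetical) example:

; "f64:64:64" gives doubles a 64-bit ABI alignment and a 64-bit preferred
; alignment; writing "f64:64:128" instead would raise only the preferred
; alignment to 16 bytes while keeping the 8-byte ABI minimum.
target datalayout = "e-p:64:64-i64:64:64-f64:64:64-n1:8:16:32:64"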

Ciao, Duncan.