NVPTX problems with dynamic alloca

I can't seem to use a dynamic alloca correctly, but I don't see anything wrong in my IR.
The IR (see below) does not originate from a higher-level language; it was built by my application via the IRBuilder.

There are two uses of alloca:

  1. a small static allocation, which is located in the entry block (I read that this is the correct way);
  2. a dynamic allocation, %14, which is outside any loop.

llc -march=nvptx64 -mcpu=sm_75 < dynalloc.ll

Output below.

Am I using the dynamic alloca incorrectly or is it not supported in NVPTX?

//
// Generated by LLVM NVPTX Back-End
//

.version 6.3
.target sm_75
.address_size 64

LLVM ERROR: Cannot select: 0x5645e59e9180: i64,ch = dynamic_stackalloc 0x5645e59e9c10:1, 0x5645e59e9118, Constant:i64<0>
0x5645e59e9118: i64 = and 0x5645e59e9048, Constant:i64<34359738360>
0x5645e59e9048: i64 = add nuw 0x5645e59e9800, Constant:i64<7>
0x5645e59e9800: i64 = NVPTXISD::MUL_WIDE_UNSIGNED 0x5645e59e95f8, Constant:i32<4>
0x5645e59e95f8: i32,ch = CopyFromReg 0x5645e59ac2f8, Register:i32 %14
0x5645e59e93f0: i32 = Register %14
0x5645e59e8e40: i32 = Constant<4>
0x5645e59e9b40: i64 = Constant<7>
0x5645e59e8d70: i64 = Constant<34359738360>
0x5645e59e8d08: i64 = Constant<0>
In function: eval0
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0. Program arguments: llc -march=nvptx64 -mcpu=sm_75

  1. Running pass ‘Function Pass Manager’ on module ‘’.
  2. Running pass ‘NVPTX DAG->DAG Pattern Instruction Selection’ on function ‘@eval0
    #0 0x00007ff4eb5de818 PrintStackTraceSignalHandler(void*) Signals.cpp:0:0
    #1 0x00007ff4eb5dbe6b SignalHandler(int) Signals.cpp:0:0
    #2 0x00007ff4eac42520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
    #3 0x00007ff4eac96a7c pthread_kill (/lib/x86_64-linux-gnu/libc.so.6+0x96a7c)
    #4 0x00007ff4eac42476 gsignal (/lib/x86_64-linux-gnu/libc.so.6+0x42476)
    #5 0x00007ff4eac287f3 abort (/lib/x86_64-linux-gnu/libc.so.6+0x287f3)
    #6 0x00007ff4eb461bb5 llvm::json::operator==(llvm::json::Value const&, llvm::json::Value const&) (.cold) JSON.cpp:0:0
    #7 0x00007ff4ed4b663a llvm::SelectionDAGISel::CannotYetSelect(llvm::SDNode*) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMSelectionDAG.so.15+0x2b663a)
    #8 0x00007ff4ed4b947a llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMSelectionDAG.so.15+0x2b947a)
    #9 0x00007ff4ed4b38f6 llvm::SelectionDAGISel::DoInstructionSelection() (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMSelectionDAG.so.15+0x2b38f6)
    #10 0x00007ff4ed4be01c llvm::SelectionDAGISel::CodeGenAndEmitDAG() (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMSelectionDAG.so.15+0x2be01c)
    #11 0x00007ff4ed4c0c88 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMSelectionDAG.so.15+0x2c0c88)
    #12 0x00007ff4ed4c3567 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.0) SelectionDAGISel.cpp:0:0
    #13 0x00007ff4eccde7f3 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (.part.0) MachineFunctionPass.cpp:0:0
    #14 0x00007ff4eba35470 llvm::FPPassManager::runOnFunction(llvm::Function&) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMCore.so.15+0x235470)
    #15 0x00007ff4eba355a4 llvm::FPPassManager::runOnModule(llvm::Module&) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMCore.so.15+0x2355a4)
    #16 0x00007ff4eba35f3d llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/toolchain/install/llvm-15.0.7-nvptx-release/lib/libLLVMCore.so.15+0x235f3d)
    #17 0x00005645e53d466f compileModule(char**, llvm::LLVMContext&) llc.cpp:0:0
    #18 0x00005645e53cc8c6 main (/home/toolchain/install/llvm-15.0.7-nvptx-release/bin/llc+0xb8c6)
    #19 0x00007ff4eac29d90 (/lib/x86_64-linux-gnu/libc.so.6+0x29d90)
    #20 0x00007ff4eac29e40 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x29e40)
    #21 0x00005645e53cd015 _start (/home/toolchain/install/llvm-15.0.7-nvptx-release/bin/llc+0xc015)
    Aborted (core dumped)
; ModuleID = 'module'
source_filename = "module"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"

; Reproducer function from the report. Each thread derives a linear index from
; the PTX special registers, returns early when that index is >= %arg0, and
; otherwise loops %arg8 times: per iteration it loads one float from %arg7 and
; one from %arg5, adds them, and stores the sum into a runtime-sized stack
; buffer (%14). Contains one static alloca (%0) and one dynamic alloca (%14).
define void @eval0(i32 %arg0, i32 %arg1, ptr %arg2, ptr %arg3, i32 %arg4, ptr %arg5, i32 %arg6, ptr %arg7, i32 %arg8) {
stack:
  ; Static alloca: a single float slot with a constant size, placed in the
  ; entry block.
  %0 = alloca float, align 4
  br label %afterstack

afterstack:                                       ; preds = %stack
  ; Read the thread/block special registers via the NVVM intrinsics.
  %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  %3 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %4 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
  %5 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
  ; %9 = ((ctaid.y * nctaid.x) + ctaid.x) * ntid.x + tid.x
  %6 = mul i32 %4, %5
  %7 = add nsw i32 %6, %3
  %8 = mul i32 %7, %2
  %9 = add nsw i32 %8, %1
  ; Threads whose index %9 is >= %arg0 take the early return in %L0.
  %10 = icmp sge i32 %9, %arg0
  br i1 %10, label %L0, label %L1

L0:                                               ; preds = %afterstack
  ret void

L1:                                               ; preds = %afterstack
  ; %12 = the i32 stored at %arg2[%9]; used below as an offset into %arg7/%arg5.
  %11 = getelementptr i32, ptr %arg2, i32 %9
  %12 = load i32, ptr %11, align 4
  ; Element count for the dynamic allocation: %arg8 * 1.
  %13 = mul i32 %arg8, 1
  ; Dynamic alloca: %13 floats, where %13 is only known at run time, and the
  ; instruction sits outside the entry block (but before the loop). This is the
  ; allocation the post refers to as "%14"; presumably it is what lowers to the
  ; dynamic_stackalloc node that llc reports it cannot select.
  %14 = alloca float, i32 %13, align 4
  br label %L2

L2:                                               ; preds = %L3, %L1
  ; Loop header: %15 is the counter, starting at 0 and taking %36 on the back
  ; edge; the loop runs while %15 < %arg8.
  %15 = phi i32 [ 0, %L1 ], [ %36, %L3 ]
  %16 = icmp slt i32 %15, %arg8
  br i1 %16, label %L3, label %L4

L3:                                               ; preds = %L2
  ; %17 is 0 * %arg8, so %18 is effectively the loop counter %15.
  %17 = mul i32 0, %arg8
  %18 = add nsw i32 %17, %15
  ; First operand: %arg7[%15 * %arg1 + %12].
  %19 = mul i32 %18, %arg1
  %20 = add nsw i32 %19, %12
  %21 = getelementptr float, ptr %arg7, i32 %20
  %22 = load float, ptr %21, align 4
  ; %23 is 0 * %arg6, so %24 is again effectively %15.
  %23 = mul i32 0, %arg6
  %24 = add nsw i32 %23, %15
  ; Second operand: %arg5[%15 * %arg1 + %12].
  %25 = mul i32 %24, %arg1
  %26 = add nsw i32 %25, %12
  %27 = getelementptr float, ptr %arg5, i32 %26
  %28 = load float, ptr %27, align 4
  %29 = fadd float %22, %28
  ; Round-trip the sum through the static slot %0 (store, then reload).
  %30 = getelementptr float, ptr %0, i32 0
  store float %29, ptr %30, align 4
  %31 = getelementptr float, ptr %0, i32 0
  %32 = load float, ptr %31, align 4
  ; Write the reloaded value to element %15 of the dynamic buffer %14
  ; (%33 is 0 * %arg8, so %34 is effectively %15).
  %33 = mul i32 0, %arg8
  %34 = add nsw i32 %33, %15
  %35 = getelementptr float, ptr %14, i32 %34
  store float %32, ptr %35, align 4
  ; Increment the loop counter and branch back to the header.
  %36 = add nsw i32 %15, 1
  br label %L2

L4:                                               ; preds = %L2
  ret void
}

; Declarations of the NVVM intrinsics that @eval0 calls to read PTX special
; registers (tid.x, ntid.x, ctaid.x, ctaid.y, nctaid.x). Each takes no
; arguments, returns i32, and uses attribute group #0 defined at the end.

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #0

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() #0

; Function Attrs: nocallback nofree nosync nounwind readnone speculatable willreturn
declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() #0

; Attribute group shared by all five intrinsic declarations above.
attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn }

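Technically, recent PTX versions (PTX 7.3+) do have a dynamic alloca instruction, but we do not have it implemented/supported in LLVM yet. That is why instruction selection fails with "Cannot select: ... dynamic_stackalloc" for your dynamically sized alloca; the IR itself is fine.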

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#stack-manipulation-instructions-alloca

–Artem

2 Likes