Hi, I hope the title is accurate and explicit enough; if it isn’t, feel free to correct it.
When lowering a GPU kernel that contains a `memref.alloca` to ROCDL, MLIR seems to generate `llvm.alloca` operations in the wrong address space.
For example, given this code:
// File test.mlir
module attributes {gpu.container_module} {
llvm.func @main() {
%1 = arith.constant 1 : index
gpu.launch_func @test_func::@test_func blocks in (%1, %1, %1) threads in (%1, %1, %1)
llvm.return
}
gpu.module @test_func {
gpu.func @test_func () kernel {
%0 = memref.alloca() : memref<1xi8>
%1 = arith.constant 0 :i8
%2 = arith.constant 0 :index
memref.store %1, %0[%2] : memref<1xi8>
gpu.return
}
}
}
I’m using `mlir-opt` and `mlir-translate` like this to get LLVM IR:
mlir-opt -convert-gpu-to-rocdl -gpu-to-hsaco='chip=gfx906' -gpu-to-llvm test.mlir -gpu-to-llvm | mlir-translate -mlir-to-llvmir -o test.ll
This produces the following `main` function in `test.ll`:
// ...
define void @main() !dbg !3 {
%1 = call ptr @mgpuModuleLoad(ptr @test_func_gpubin_cst), !dbg !7
%2 = call ptr @mgpuModuleGetFunction(ptr %1, ptr @test_func_test_func_kernel_name), !dbg !9
%3 = call ptr @mgpuStreamCreate(), !dbg !10
%4 = alloca %0, align 8, !dbg !11
%5 = alloca ptr, i32 0, align 8, !dbg !12
call void @mgpuLaunchKernel(ptr %2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr %3, ptr %5, ptr null), !dbg !13
call void @mgpuStreamSynchronize(ptr %3), !dbg !14
call void @mgpuStreamDestroy(ptr %3), !dbg !15
call void @mgpuModuleUnload(ptr %1), !dbg !16
ret void, !dbg !17
}
// ...
Then, using `clang` to compile `test.ll`, I get a segfault with the following stack trace:
clang-15: /home/racolin/microcard/llvm-project/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp:497: void getCopyToParts(llvm::SelectionDAG&, const llvm::SDLoc&, llvm::SDValue, llvm::SDV
alue*, unsigned int, llvm::MVT, const llvm::Value*, llvm::Optional<unsigned int>, llvm::ISD::NodeType): Assertion `NumParts == 1 && "No-op copy with multiple parts!"' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /home/racolin/microcard/builds/llvm/bin/clang-15 -cc1 -triple amdgcn -S -disable-free -clear-ast-before-backend -main-file-name test.final.ll -mrelocation-model pic
-pic-level 1 -fhalf-no-semantic-interposition -mframe-pointer=all -fmath-errno -ffp-contract=on -fno-rounding-math -fno-verbose-asm -no-integrated-as -mconstructor-aliases -mllvm -treat-scal
able-fixed-error-as-warning -debugger-tuning=gdb -fno-dwarf-directory-asm -resource-dir /home/racolin/microcard/builds/llvm/lib/clang/15.0.2 -fdebug-compilation-dir=/home/racolin/microcard/tm
p -ferror-limit 19 -fgnuc-version=4.2.1 -fcolor-diagnostics -o /tmp/test-f7a861.s -x ir test.final.ll
1. Code generation
2. Running pass 'CallGraph Pass Manager' on module 'test.final.ll'.
3. Running pass 'AMDGPU DAG->DAG Pattern Instruction Selection' on function '@main'
#0 0x00007f3e13514f41 PrintStackTraceSignalHandler(void*) Signals.cpp:0:0
#1 0x00007f3e13512754 SignalHandler(int) Signals.cpp:0:0
#2 0x00007f3e17cce140 __restore_rt (/lib/x86_64-linux-gnu/libpthread.so.0+0x13140)
#3 0x00007f3e12e0ece1 raise ./signal/../sysdeps/unix/sysv/linux/raise.c:51:1
#4 0x00007f3e12df8537 abort ./stdlib/abort.c:81:7
#5 0x00007f3e12df840f get_sysdep_segment_value ./intl/loadmsgcat.c:509:8
#6 0x00007f3e12df840f _nl_load_domain ./intl/loadmsgcat.c:970:34
#7 0x00007f3e12e07662 (/lib/x86_64-linux-gnu/libc.so.6+0x31662)
#8 0x00007f3e128a08d1 getCopyToParts(llvm::SelectionDAG&, llvm::SDLoc const&, llvm::SDValue, llvm::SDValue*, unsigned int, llvm::MVT, llvm::Value const*, llvm::Optional<unsigned int>, llvm::
ISD::NodeType) SelectionDAGBuilder.cpp:0:0
#9 0x00007f3e128a5092 llvm::TargetLowering::LowerCallTo(llvm::TargetLowering::CallLoweringInfo&) const (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG.so.15+0x23b092)
#10 0x00007f3e128b20af llvm::SelectionDAGBuilder::lowerInvokable(llvm::TargetLowering::CallLoweringInfo&, llvm::BasicBlock const*) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG
.so.15+0x2480af)
#11 0x00007f3e128d07de llvm::SelectionDAGBuilder::LowerCallTo(llvm::CallBase const&, llvm::SDValue, bool, bool, llvm::BasicBlock const*) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelect
ionDAG.so.15+0x2667de)
#12 0x00007f3e128bfa5d llvm::SelectionDAGBuilder::visitCall(llvm::CallInst const&) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG.so.15+0x255a5d)
#13 0x00007f3e128ed5a9 llvm::SelectionDAGBuilder::visit(llvm::Instruction const&) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG.so.15+0x2835a9)
#14 0x00007f3e1296dd28 llvm::SelectionDAGISel::SelectBasicBlock(llvm::ilist_iterator<llvm::ilist_detail::node_options<llvm::Instruction, true, false, void>, false, true>, llvm::ilist_iterator
<llvm::ilist_detail::node_options<llvm::Instruction, true, false, void>, false, true>, bool&) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG.so.15+0x303d28)
#15 0x00007f3e1296ef78 llvm::SelectionDAGISel::SelectAllBasicBlocks(llvm::Function const&) (/home/racolin/microcard/builds/llvm/lib/libLLVMSelectionDAG.so.15+0x304f78)
#16 0x00007f3e12970f82 llvm::SelectionDAGISel::runOnMachineFunction(llvm::MachineFunction&) (.part.0) SelectionDAGISel.cpp:0:0
#17 0x00007f3e167d2af5 llvm::MachineFunctionPass::runOnFunction(llvm::Function&) (.part.0) MachineFunctionPass.cpp:0:0
#18 0x00007f3e1403416b llvm::FPPassManager::runOnFunction(llvm::Function&) (/home/racolin/microcard/builds/llvm/lib/libLLVMCore.so.15+0x25e16b)
#19 0x00007f3e1461dbf2 (anonymous namespace)::CGPassManager::runOnModule(llvm::Module&) CallGraphSCCPass.cpp:0:0
#20 0x00007f3e14034c13 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/racolin/microcard/builds/llvm/lib/libLLVMCore.so.15+0x25ec13)
#21 0x00007f3e16e9ecbc clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions con
st&, llvm::StringRef, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream>>) (/home/racolin/microcard/builds/llvm/lib/lib
clangCodeGen.so.15+0xdecbc)
#22 0x00007f3e172a121e clang::CodeGenAction::ExecuteAction() (/home/racolin/microcard/builds/llvm/lib/libclangCodeGen.so.15+0x4e121e)
#23 0x00007f3e15becb59 clang::FrontendAction::Execute() (/home/racolin/microcard/builds/llvm/lib/libclangFrontend.so.15+0x11ab59)
#24 0x00007f3e15b6ae16 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/home/racolin/microcard/builds/llvm/lib/libclangFrontend.so.15+0x98e16)
#25 0x00007f3e17cb7428 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/home/racolin/microcard/builds/llvm/lib/libclangFrontendTool.so.15+0x4428)
#26 0x0000000000413f0d cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/home/racolin/microcard/builds/llvm/bin/clang-15+0x413f0d)
#27 0x000000000040dae0 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) driver.cpp:0:0
#28 0x0000000000410679 clang_main(int, char**) (/home/racolin/microcard/builds/llvm/bin/clang-15+0x410679)
#29 0x00007f3e12df9d0a __libc_start_main ./csu/../csu/libc-start.c:308:16
#30 0x000000000040d06a _start (/home/racolin/microcard/builds/llvm/bin/clang-15+0x40d06a)
clang-15: error: unable to execute command: Aborted
clang-15: error: clang frontend command failed due to signal (use -v to see invocation)
clang version 15.0.2 (https://github.com/llvm/llvm-project.git 4bd3f3759259548e159aeba5c76efb9a0864e6fa)
Target: amdgcn
Thread model: posix
InstalledDir: /home/racolin/microcard/builds/llvm/bin
clang-15: note: diagnostic msg: Error generating preprocessed source(s) - no preprocessable inputs.
Modifying the above function by adding `addrspace(5)` to the `alloca` operations and then casting the resulting pointers solves the segfault:
// ...
define void @main() !dbg !3 {
%1 = call ptr @mgpuModuleLoad(ptr @test_func_gpubin_cst), !dbg !7
%2 = call ptr @mgpuModuleGetFunction(ptr %1, ptr @test_func_test_func_kernel_name), !dbg !9
%3 = call ptr @mgpuStreamCreate(), !dbg !10
// HERE
%a = alloca %0, align 8, addrspace(5), !dbg !11
%b = alloca ptr, i32 0, align 8, addrspace(5), !dbg !12
%4 = addrspacecast ptr addrspace(5) %a to ptr
%5 = addrspacecast ptr addrspace(5) %b to ptr
call void @mgpuLaunchKernel(ptr %2, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i32 0, ptr %3, ptr %5, ptr null), !dbg !13
call void @mgpuStreamSynchronize(ptr %3), !dbg !14
call void @mgpuStreamDestroy(ptr %3), !dbg !15
call void @mgpuModuleUnload(ptr %1), !dbg !16
ret void, !dbg !17
}
I have several questions:
- Am I doing something wrong?
- Should MLIR generate alloca operations that use addrspace(5)?
- Do you know why LLVM crashes when compiling code for the AMDGPU target with allocas in addrspace(0) instead of addrspace(5)?
Thank you for your time. I hope the post is clear enough; if it isn’t, feel free to ask for more details.