@grypp
Hi. I was trying to get the 32-bit example running. I was able to compile to PTX, but now only the grid size and block size are being passed as i64, which breaks the compilation. This is the command I was running:
mlir-opt --arith-expand --lower-affine -convert-scf-to-cf --convert-cf-to-llvm=index-bitwidth=32 --convert-func-to-llvm="index-bitwidth=32" --gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 index-bitwidth=32 cubin-triple=nvptx-nvidia-cuda host-bare-ptr-calling-convention=1" test.mlir
This is the error, followed by the IR at the point where it fails:
test.mlir:7:13: error: failed to legalize operation 'builtin.unrealized_conversion_cast' that was explicitly marked illegal
%c256 = arith.constant 256 : index
^
test.mlir:7:13: note: see current operation: %8 = "builtin.unrealized_conversion_cast"(%7) : (i32) -> index
module attributes {gpu.container_module} {
llvm.func @malloc(i64) -> !llvm.ptr
llvm.func @main() {
%0 = llvm.mlir.constant(0 : i8) : i8
%1 = llvm.mlir.constant(1 : index) : i32
%2 = llvm.mlir.constant(16 : index) : i32
%3 = llvm.mlir.constant(4 : index) : i32
%4 = llvm.mlir.constant(128 : index) : i32
%5 = llvm.mlir.constant(32 : index) : i32
%6 = llvm.mlir.constant(8 : index) : i32
%7 = llvm.mlir.constant(256 : index) : i32
%8 = builtin.unrealized_conversion_cast %7 : i32 to index
%9 = builtin.unrealized_conversion_cast %8 : index to i64
%10 = builtin.unrealized_conversion_cast %2 : i32 to index
%11 = builtin.unrealized_conversion_cast %10 : index to i64
%12 = builtin.unrealized_conversion_cast %1 : i32 to index
%13 = builtin.unrealized_conversion_cast %12 : index to i64
%14 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
%15 = llvm.mlir.zero : !llvm.ptr
%16 = llvm.getelementptr %15[16384] : (!llvm.ptr) -> !llvm.ptr, f32
%17 = llvm.ptrtoint %16 : !llvm.ptr to i64
%18 = llvm.call @malloc(%17) : (i64) -> !llvm.ptr
%19 = llvm.call @mgpuMemAlloc(%17, %14, %0) : (i64, !llvm.ptr, i8) -> !llvm.ptr
llvm.call @mgpuMemcpy(%19, %18, %17, %14) : (!llvm.ptr, !llvm.ptr, i64, !llvm.ptr) -> ()
%20 = llvm.call @mgpuEventCreate() : () -> !llvm.ptr
llvm.call @mgpuEventRecord(%20, %14) : (!llvm.ptr, !llvm.ptr) -> ()
%21 = llvm.call @mgpuMemAlloc(%17, %14, %0) : (i64, !llvm.ptr, i8) -> !llvm.ptr
%22 = llvm.call @mgpuEventCreate() : () -> !llvm.ptr
llvm.call @mgpuEventRecord(%22, %14) : (!llvm.ptr, !llvm.ptr) -> ()
%23 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr
llvm.call @mgpuStreamWaitEvent(%23, %22) : (!llvm.ptr, !llvm.ptr) -> ()
llvm.call @mgpuStreamWaitEvent(%23, %20) : (!llvm.ptr, !llvm.ptr) -> ()
llvm.call @mgpuEventDestroy(%22) : (!llvm.ptr) -> ()
llvm.call @mgpuEventDestroy(%20) : (!llvm.ptr) -> ()
gpu.launch_func <%23 : !llvm.ptr> @main_kernel::@main_kernel blocks in (%11, %13, %13) threads in (%9, %13, %13) : i64 args(%3 : i32, %5 : i32, %4 : i32, %6 : i32, %19 : !llvm.ptr, %21 : !llvm.ptr)
llvm.return
}
gpu.binary @main_kernel [#gpu.object<#nvvm.target<triple = "nvptx-nvidia-cuda", chip = "sm_80">,
Thank you for your help.