Hi,
I have a simple example in which I want to allocate per-thread (register) memory on the GPU. I’m using memory attribution and specifying a `private` attribution on `gpu.launch`. However, the code fails to lower: the ConvertGpuOpsToNVVMOps pass (convert-gpu-to-nvvm) fails.
Here is a simpler version of my problem:
// Minimal reproducer: a host function that launches one GPU kernel which
// zero-fills a 4-element f32 buffer declared through the `private(...)`
// attribution clause of gpu.launch.
module {
func.func @main() {
// Value written to every element of the private buffer.
%cst = arith.constant 0.000000e+00 : f32
%c1 = arith.constant 1 : index
%c256 = arith.constant 256 : index
// Grid of 256x1x1 blocks, each with 256x1x1 threads. %arg12 is the private
// attribution; its memref type carries the integer memory space 5.
// NOTE(review): in the failing dump quoted below, the converted memref
// descriptor uses `ptr<5>` while the generated llvm.alloca yields a plain
// `!llvm.ptr`, which is the type mismatch reported by llvm.insertvalue.
// Presumably the NVVM lowering expects `#gpu.address_space<private>` here
// rather than a raw integer address space; TODO confirm.
gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c256, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c256, %arg10 = %c1, %arg11 = %c1) private(%arg12 : memref<4xf32, 5>) {
// Store %cst into each of the 4 elements of the private buffer.
affine.for %arg13 = 0 to 4 {
memref.store %cst, %arg12[%arg13] : memref<4xf32, 5>
}
gpu.terminator
}
return
}
}
Run command:
mlir-opt --canonicalize --cse --arith-expand --loop-invariant-code-motion --lower-affine -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 opt-level=3" $TEST_MLIR
The error:
<unknown>:0: error: 'llvm.insertvalue' op Type mismatch: cannot insert '!llvm.ptr' into '!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>'
<unknown>:0: note: see current operation: %3 = "llvm.insertvalue"(%2, %1) <{position = array<i64: 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
IR Dump:
// -----// IR Dump After ConvertGpuOpsToNVVMOps Failed (convert-gpu-to-nvvm) //----- //
"gpu.module"() <{targets = [#nvvm.target<O = 3, chip = "sm_80">]}> ({
"llvm.func"() <{CConv = #llvm.cconv<ccc>, function_type = !llvm.func<void (f32)>, linkage = #llvm.linkage<external>, sym_name = "main_kernel", visibility_ = 0 : i64}> ({
^bb0(%arg0: f32):
%0 = "llvm.mlir.constant"() <{value = 4 : i64}> : () -> i64
%1 = "llvm.alloca"(%0) <{elem_type = f32}> : (i64) -> !llvm.ptr
%2 = "llvm.mlir.undef"() : () -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%3 = "llvm.insertvalue"(%2, %1) <{position = array<i64: 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%4 = "llvm.insertvalue"(%3, %1) <{position = array<i64: 1>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%5 = "llvm.mlir.constant"() <{value = 0 : index}> : () -> i64
%6 = "llvm.insertvalue"(%4, %5) <{position = array<i64: 2>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%7 = "llvm.mlir.constant"() <{value = 4 : index}> : () -> i64
%8 = "llvm.insertvalue"(%6, %7) <{position = array<i64: 3, 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%9 = "llvm.mlir.constant"() <{value = 1 : index}> : () -> i64
%10 = "llvm.insertvalue"(%8, %9) <{position = array<i64: 4, 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
%11 = "llvm.mlir.constant"() <{value = 1 : index}> : () -> i64
%12 = "llvm.mlir.constant"() <{value = 4 : index}> : () -> i64
%13 = "llvm.mlir.constant"() <{value = 0 : index}> : () -> i64
%14 = "builtin.unrealized_conversion_cast"(%13) : (i64) -> index
"llvm.br"()[^bb1] : () -> ()
^bb1: // pred: ^bb0
"llvm.br"(%13)[^bb2] : (i64) -> ()
^bb2(%15: i64): // 2 preds: ^bb1, ^bb3
%16 = "builtin.unrealized_conversion_cast"(%15) : (i64) -> index
%17 = "builtin.unrealized_conversion_cast"(%16) : (index) -> i64
%18 = "llvm.icmp"(%17, %12) <{predicate = 2 : i64}> : (i64, i64) -> i1
"llvm.cond_br"(%18)[^bb3, ^bb4] <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (i1) -> ()
^bb3: // pred: ^bb2
%19 = "llvm.extractvalue"(%10) <{position = array<i64: 1>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<5>
%20 = "llvm.getelementptr"(%19, %15) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr<5>, i64) -> !llvm.ptr<5>
"llvm.store"(%arg0, %20) <{ordering = 0 : i64}> : (f32, !llvm.ptr<5>) -> ()
%21 = "llvm.add"(%17, %11) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
%22 = "builtin.unrealized_conversion_cast"(%21) : (i64) -> index
"llvm.br"(%21)[^bb2] : (i64) -> ()
^bb4: // pred: ^bb2
"llvm.return"() : () -> ()
}) {gpu.kernel, gpu.known_block_size = array<i32: 256, 1, 1>, gpu.known_grid_size = array<i32: 256, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 256, 1, 1>} : () -> ()
"gpu.module_end"() : () -> ()
}) {sym_name = "main_kernel"} : () -> ()