Failure to lower private GPU memory

Hi,

I have a simple example in which I want to allocate a register on the GPU. I’m using memory attribution and specifying a private attribution on gpu.launch. However, the code fails to lower in the ConvertGpuOpsToNVVMOps pass (convert-gpu-to-nvvm). Here is a simplified version of my problem:

// Minimal reproducer: a gpu.launch with a single private memory attribution
// whose four f32 elements are each overwritten with 0.0.
module {
  func.func @main() {
    // Value stored into every element of the private buffer.
    %cst = arith.constant 0.000000e+00 : f32
    %c1 = arith.constant 1 : index
    %c256 = arith.constant 256 : index
    // Launch a 256x1x1 grid of 256x1x1 thread blocks. %arg12 is the private
    // attribution; its memref type uses the raw integer memory space 5.
    // NOTE(review): this integer memory space appears to be what the
    // convert-gpu-to-nvvm lowering rejects (the descriptor expects ptr<5> but
    // the alloca yields a plain !llvm.ptr); #gpu.address_space<private> is
    // the suggested spelling -- confirm.
    gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c256, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c256, %arg10 = %c1, %arg11 = %c1) private(%arg12 : memref<4xf32, 5>) {
      // Zero-fill the 4-element private buffer, one element per iteration.
      affine.for %arg13 = 0 to 4 {
        memref.store %cst, %arg12[%arg13] : memref<4xf32, 5>
      }
      gpu.terminator
    }
    return
  }
}

Run command:

 mlir-opt --canonicalize --cse --arith-expand --loop-invariant-code-motion --lower-affine -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 opt-level=3"  $TEST_MLIR 

The error:

<unknown>:0: error: 'llvm.insertvalue' op Type mismatch: cannot insert '!llvm.ptr' into '!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>'
<unknown>:0: note: see current operation: %3 = "llvm.insertvalue"(%2, %1) <{position = array<i64: 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>

IR Dump:

// -----// IR Dump After ConvertGpuOpsToNVVMOps Failed (convert-gpu-to-nvvm) //----- //
"gpu.module"() <{targets = [#nvvm.target<O = 3, chip = "sm_80">]}> ({
  "llvm.func"() <{CConv = #llvm.cconv<ccc>, function_type = !llvm.func<void (f32)>, linkage = #llvm.linkage<external>, sym_name = "main_kernel", visibility_ = 0 : i64}> ({
  ^bb0(%arg0: f32):
    %0 = "llvm.mlir.constant"() <{value = 4 : i64}> : () -> i64
    %1 = "llvm.alloca"(%0) <{elem_type = f32}> : (i64) -> !llvm.ptr
    %2 = "llvm.mlir.undef"() : () -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %3 = "llvm.insertvalue"(%2, %1) <{position = array<i64: 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %4 = "llvm.insertvalue"(%3, %1) <{position = array<i64: 1>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, !llvm.ptr) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %5 = "llvm.mlir.constant"() <{value = 0 : index}> : () -> i64
    %6 = "llvm.insertvalue"(%4, %5) <{position = array<i64: 2>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %7 = "llvm.mlir.constant"() <{value = 4 : index}> : () -> i64
    %8 = "llvm.insertvalue"(%6, %7) <{position = array<i64: 3, 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %9 = "llvm.mlir.constant"() <{value = 1 : index}> : () -> i64
    %10 = "llvm.insertvalue"(%8, %9) <{position = array<i64: 4, 0>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>, i64) -> !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
    %11 = "llvm.mlir.constant"() <{value = 1 : index}> : () -> i64
    %12 = "llvm.mlir.constant"() <{value = 4 : index}> : () -> i64
    %13 = "llvm.mlir.constant"() <{value = 0 : index}> : () -> i64
    %14 = "builtin.unrealized_conversion_cast"(%13) : (i64) -> index
    "llvm.br"()[^bb1] : () -> ()
  ^bb1:  // pred: ^bb0
    "llvm.br"(%13)[^bb2] : (i64) -> ()
  ^bb2(%15: i64):  // 2 preds: ^bb1, ^bb3
    %16 = "builtin.unrealized_conversion_cast"(%15) : (i64) -> index
    %17 = "builtin.unrealized_conversion_cast"(%16) : (index) -> i64
    %18 = "llvm.icmp"(%17, %12) <{predicate = 2 : i64}> : (i64, i64) -> i1
    "llvm.cond_br"(%18)[^bb3, ^bb4] <{operandSegmentSizes = array<i32: 1, 0, 0>}> : (i1) -> ()
  ^bb3:  // pred: ^bb2
    %19 = "llvm.extractvalue"(%10) <{position = array<i64: 1>}> : (!llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>) -> !llvm.ptr<5>
    %20 = "llvm.getelementptr"(%19, %15) <{elem_type = f32, rawConstantIndices = array<i32: -2147483648>}> : (!llvm.ptr<5>, i64) -> !llvm.ptr<5>
    "llvm.store"(%arg0, %20) <{ordering = 0 : i64}> : (f32, !llvm.ptr<5>) -> ()
    %21 = "llvm.add"(%17, %11) <{overflowFlags = #llvm.overflow<none>}> : (i64, i64) -> i64
    %22 = "builtin.unrealized_conversion_cast"(%21) : (i64) -> index
    "llvm.br"(%21)[^bb2] : (i64) -> ()
  ^bb4:  // pred: ^bb2
    "llvm.return"() : () -> ()
  }) {gpu.kernel, gpu.known_block_size = array<i32: 256, 1, 1>, gpu.known_grid_size = array<i32: 256, 1, 1>, nvvm.kernel, nvvm.maxntid = array<i32: 256, 1, 1>} : () -> ()
  "gpu.module_end"() : () -> ()
}) {sym_name = "main_kernel"} : () -> ()

The address space specified on the memref type is incorrect. (And yes, it can be confusing :sweat_smile:)

Try replacing
memref<4xf32, 5>
with
memref<4xf32, #gpu.address_space<private>>

2 Likes

Yeah, NVIDIA uses address space 0 for private memory, while AMD uses address space 5, which is why the target-independent GPU memory space attributes exist.