There is an issue when lowering the GPU dialect

Hello

I lowered a simple program that allocates GPU memory and copies data.

module {
  func.func @main() {
    %cst = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>
    %cst_0 = arith.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>
    %0 = bufferization.to_memref %cst : tensor<2x3xf64> to memref<2x3xf64>
    %memref = gpu.alloc  () : memref<2x3xf64, 1 : i32>
    gpu.memcpy  %memref, %0 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
    %1 = bufferization.to_memref %cst_0 : tensor<3x2xf64> to memref<3x2xf64>
    %memref_1 = gpu.alloc  () : memref<3x2xf64, 1 : i32>
    gpu.memcpy  %memref_1, %1 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
    return
  }
}



The pass-pipeline I used

--pass-pipeline="builtin.module(one-shot-bufferize{bufferize-function-boundaries=true},convert-index-to-llvm,finalize-memref-to-llvm,gpu-lower-to-nvvm-pipeline{cubin-chip=sm_86 cubin-features=+ptx75 opt-level=3})"

The result

module {
  llvm.func @malloc(i64) -> !llvm.ptr
  llvm.mlir.global private constant @__constant_3x2xf64(dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<3 x array<2 x f64>>
  llvm.mlir.global private constant @__constant_2x3xf64(dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<2 x array<3 x f64>>
  llvm.func @main() {
    %0 = llvm.mlir.constant(64 : index) : i64
    %1 = llvm.mlir.addressof @__constant_3x2xf64 : !llvm.ptr
    %2 = llvm.mlir.addressof @__constant_2x3xf64 : !llvm.ptr
    %3 = llvm.mlir.constant(2 : index) : i64
    %4 = llvm.mlir.constant(3 : index) : i64
    %5 = llvm.mlir.constant(1 : index) : i64
    %6 = llvm.mlir.zero : !llvm.ptr
    %7 = llvm.getelementptr %2[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<3 x f64>>
    %8 = llvm.getelementptr %1[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<3 x array<2 x f64>>
    %9 = llvm.getelementptr %6[6] : (!llvm.ptr) -> !llvm.ptr, f64
    %10 = llvm.ptrtoint %9 : !llvm.ptr to i64
    %11 = llvm.add %10, %0 : i64
    %12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
    %13 = llvm.ptrtoint %12 : !llvm.ptr to i64
    %14 = llvm.sub %0, %5 : i64
    %15 = llvm.add %13, %14 : i64
    %16 = llvm.urem %15, %0 : i64
    %17 = llvm.sub %15, %16 : i64
    %18 = llvm.inttoptr %17 : i64 to !llvm.ptr
    %19 = llvm.mul %5, %3 : i64
    %20 = llvm.mul %19, %4 : i64
    %21 = llvm.getelementptr %6[1] : (!llvm.ptr) -> !llvm.ptr, f64
    %22 = llvm.ptrtoint %21 : !llvm.ptr to i64
    %23 = llvm.mul %20, %22 : i64
    "llvm.intr.memcpy"(%18, %7, %23) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    %24 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
    %25 = llvm.ptrtoint %24 : !llvm.ptr to i64
    %26 = llvm.add %25, %14 : i64
    %27 = llvm.urem %26, %0 : i64
    %28 = llvm.sub %26, %27 : i64
    %29 = llvm.inttoptr %28 : i64 to !llvm.ptr
    %30 = llvm.mul %5, %4 : i64
    %31 = llvm.mul %30, %3 : i64
    %32 = llvm.mul %31, %22 : i64
    "llvm.intr.memcpy"(%29, %8, %32) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    llvm.return
  }
}

Then I wanted to perform a transpose on the GPU, so I added linalg.transpose, but encountered an issue during lowering.

module {
  func.func @main() {
    %cst = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>
    %cst_0 = arith.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>
    %0 = bufferization.to_memref %cst : tensor<2x3xf64> to memref<2x3xf64>
    %memref = gpu.alloc  () : memref<2x3xf64, 1 : i32>
    gpu.memcpy  %memref, %0 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
    %1 = bufferization.to_memref %cst_0 : tensor<3x2xf64> to memref<3x2xf64>
    %memref_1 = gpu.alloc  () : memref<3x2xf64, 1 : i32>
    gpu.memcpy  %memref_1, %1 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
    linalg.transpose ins(%memref : memref<2x3xf64, 1 : i32>) outs(%memref_1 : memref<3x2xf64, 1 : i32>) permutation = [1, 0]
    return
  }
}

The pass-pipeline I used

--pass-pipeline="builtin.module(one-shot-bufferize{bufferize-function-boundaries=true},convert-linalg-to-affine-loops,func.func(convert-affine-for-to-gpu{gpu-block-dims=1 gpu-thread-dims=1}),finalize-memref-to-llvm,gpu-lower-to-nvvm-pipeline{cubin-chip=sm_86 cubin-features=+ptx75 opt-level=3})"

The result

module attributes {gpu.container_module} {
  llvm.func @malloc(i64) -> !llvm.ptr
  llvm.mlir.global private constant @__constant_3x2xf64(dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<3 x array<2 x f64>>
  llvm.mlir.global private constant @__constant_2x3xf64(dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<2 x array<3 x f64>>
  llvm.func @main() {
    %0 = llvm.mlir.zero : !llvm.ptr
    %1 = llvm.mlir.addressof @__constant_2x3xf64 : !llvm.ptr
    %2 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %3 = llvm.mlir.constant(0 : index) : i64
    %4 = llvm.mlir.addressof @__constant_3x2xf64 : !llvm.ptr
    %5 = llvm.mlir.constant(64 : index) : i64
    %6 = llvm.mlir.constant(3 : index) : i64
    %7 = llvm.mlir.constant(1 : index) : i64
    %8 = llvm.mlir.constant(2 : index) : i64
    %9 = llvm.getelementptr %1[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<3 x f64>>
    %10 = llvm.getelementptr %4[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<3 x array<2 x f64>>
    %11 = llvm.getelementptr %0[6] : (!llvm.ptr) -> !llvm.ptr, f64
    %12 = llvm.ptrtoint %11 : !llvm.ptr to i64
    %13 = llvm.add %12, %5 : i64
    %14 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
    %15 = llvm.ptrtoint %14 : !llvm.ptr to i64
    %16 = llvm.sub %5, %7 : i64
    %17 = llvm.add %15, %16 : i64
    %18 = llvm.urem %17, %5 : i64
    %19 = llvm.sub %17, %18 : i64
    %20 = llvm.inttoptr %19 : i64 to !llvm.ptr
    %21 = llvm.insertvalue %14, %2[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %22 = llvm.insertvalue %20, %21[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %23 = llvm.insertvalue %3, %22[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %24 = llvm.insertvalue %8, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %25 = llvm.insertvalue %6, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %26 = llvm.insertvalue %6, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %27 = llvm.insertvalue %7, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %28 = builtin.unrealized_conversion_cast %27 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<2x3xf64>
    %29 = llvm.mul %7, %8 : i64
    %30 = llvm.mul %29, %6 : i64
    %31 = llvm.getelementptr %0[1] : (!llvm.ptr) -> !llvm.ptr, f64
    %32 = llvm.ptrtoint %31 : !llvm.ptr to i64
    %33 = llvm.mul %30, %32 : i64
    "llvm.intr.memcpy"(%20, %9, %33) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    %memref = gpu.alloc  () : memref<2x3xf64, 1 : i32>
    %34 = builtin.unrealized_conversion_cast %memref : memref<2x3xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.memcpy  %memref, %28 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
    %35 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
    %36 = llvm.ptrtoint %35 : !llvm.ptr to i64
    %37 = llvm.add %36, %16 : i64
    %38 = llvm.urem %37, %5 : i64
    %39 = llvm.sub %37, %38 : i64
    %40 = llvm.inttoptr %39 : i64 to !llvm.ptr
    %41 = llvm.insertvalue %35, %2[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %42 = llvm.insertvalue %40, %41[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %43 = llvm.insertvalue %3, %42[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %44 = llvm.insertvalue %6, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %45 = llvm.insertvalue %8, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %46 = llvm.insertvalue %8, %45[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %47 = llvm.insertvalue %7, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %48 = builtin.unrealized_conversion_cast %47 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x2xf64>
    %49 = llvm.mul %7, %6 : i64
    %50 = llvm.mul %49, %8 : i64
    %51 = llvm.mul %50, %32 : i64
    "llvm.intr.memcpy"(%40, %10, %51) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    %memref_0 = gpu.alloc  () : memref<3x2xf64, 1 : i32>
    %52 = builtin.unrealized_conversion_cast %memref_0 : memref<3x2xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.memcpy  %memref_0, %48 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
    %53 = llvm.extractvalue %34[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %54 = llvm.extractvalue %34[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %55 = llvm.extractvalue %34[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %56 = llvm.extractvalue %34[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %57 = llvm.extractvalue %34[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %58 = llvm.extractvalue %34[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %59 = llvm.extractvalue %34[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %60 = llvm.extractvalue %52[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %61 = llvm.extractvalue %52[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %62 = llvm.extractvalue %52[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %63 = llvm.extractvalue %52[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %64 = llvm.extractvalue %52[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %65 = llvm.extractvalue %52[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %66 = llvm.extractvalue %52[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.launch_func  @main_kernel::@main_kernel blocks in (%6, %7, %7) threads in (%8, %7, %7) : i64 args(%53 : !llvm.ptr<1>, %54 : !llvm.ptr<1>, %55 : i64, %56 : i64, %57 : i64, %58 : i64, %59 : i64, %60 : !llvm.ptr<1>, %61 : !llvm.ptr<1>, %62 : i64, %63 : i64, %64 : i64, %65 : i64, %66 : i64)
    llvm.return
  }
  gpu.binary @main_kernel  [#gpu.object<#nvvm.target<O = 3, chip = "sm_86", features = "+ptx75">, "P\EDU\BA\01\00\10\00\A0\0E\00\00\00\00\00\00\02\00\01\01@\00\00\00(\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00V\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00}\00\00\00\00\00\00\00\00\00\00\00\80\0B\00\00\00\00\00\00\80\08\00\00\00\00\00\00V\05V\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.nv.constant0.main_kernel\00.rel.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.rel.nv.constant0.main_kernel\00.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00main_kernel|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\04\00\00\00\044\00\00\00\0C\81\80\80(\00\04\FC\FF\FF?\00\00\00\00\00\00\00\04/\08\00\06\00\00\00\0A\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\047\04\00}\00\00\00\015\00\00\04\0A\08\00\02\00\00\00`\01p\00\03\19p\00\04\17\0C\00\00\00\00\00\0D\00h\00\00\F0!\00\04\17\0C\00\00\00\00\00\0C\00`sz\01\00\00\0A\00\00\00\0F\00\00\00\E4\0F\00\19y\07\00\00\00\00\00\00!\00\00\00\22\0E\00\02x\02\00\18\00\00\00\00\0F\00\00\00\E2\0F\00\B9z\04\00\00F\00\00\00\0A\00\00\00\E4\0F\00\19y\05\00\00\00\00\00\00%\00\00\00d\0E\00%v\02\07\00Z\00\00\02\00\8E\07\00\CC\1F\00%x\02\05\08\00\00\00\02\00\8E\07\00\CC/\00\81y\02\02\04\00\00\00\00\1B\1E\0C\00\A2\0E\00%x\04\05\02\
00\00\00\FF\00\8E\07\00\CA\0F\00\12r\07\04\07\00\00\00\FF\FC\8E\07\00\C8\0F\00\11z\04\07\00h\00\00\FF\18\80\07\00\C8\0F\00\11z\05\07\00i\00\00\05\1C\0F\00\00\CA\0F\00\86y\00\04\02\00\00\00\04\1B\10\0C\00\E2O\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\00\00\F0\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18yp\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00p\03\00\00\00\00\00\000\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00D\00\00\00\00\00\00p@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\03\00\00\00\00\00\00\18\01\00\00\00\00\00\00\03\00\00\00\0B\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\00\00\00\01\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B8\04\00\00\00\00\00\00 \00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\F4\00\00\00\0B\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\04\00\00\00\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\B5\00\00\00\09\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E8\04\00\00\00\00\00\00\10\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\10\00\00\00\00\00\00\00pw\04\00\00\00\00\00\00H\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\0A//\03\00\F3\1E\0A.version 7.5\0A.target sm_86\0A.address_size 64\0A1\00\F8-isible .entry main_kernel(\0A.param .u64 .ptr .global .align 
1/\00\11_-\00?_0,7\00\22\1A17\00\0E!\00\1F2!\00\0D\1F3!\00\0D\1F4!\00\0D\1F5!\00\0D\1F6\DC\00#\1F77\00#\1F8\8F\00\0D\1F9!\00\0D/10\22\00\0E\0Fx\01\0E\1F1y\01\0E\F4\1913\0A)\0A.maxntid 2, 1, 1\0A{\0A.reg .b32 %r<3>;\11\00\9564 %rd<12\13\00\10f\13\000fd<\12\00a\0Amov.u5\00\F2\001, %ctaid.x;\0Ald\86\00\22.u@\00O1, [\8C\00\00'];A\00R2, %t?\000cvt:\00\03\19\00\10d\1A\00\C4r2;\0Amul.wide\18\00\223,7\00\9224;\0Aadd.sm\00#4,s\00\00#\00\0E6\00\225,\AE\00\1985\00&6,;\00\115\C1\00\02\F4\01\04\EF\00\00\C2\00\00'\00\1E]\DD\00\1F7\DD\00\02.8]{\00\138{\00T2;\0Aor\\\01%9,\1B\00td2;\0Ashl\1A\00310,!\00\193\AC\00311,x\00\01%\00H;\0Ast\AE\00\11[#\00!],\B7\00\B0;\0Aret;\0A\0A}\0A\00\00\00">]
}

I performed lowering, but the code still contains builtin.unrealized_conversion_cast, gpu.alloc, and gpu.memcpy.

I have two questions:

  1. Can this issue be resolved by modifying the pass pipeline? If not, what should I do?
  2. I applied the gpu-lower-to-nvvm-pipeline — how is it possible that GPU dialect operations are still present in the output?

Thank you for reading!

This means your lowering is incomplete and these operations are not handled.

Specifically, it looks like your code and pass pipeline don’t differentiate between host and device code. It should. NVVM is only relevant for device code. Take a look at the documentation: 'gpu' Dialect - MLIR.

1 Like

Thank you for your answer!

So after creating gpu.launch, should I separate the host and device code through -gpu-kernel-outlining and then apply the gpu-lower-to-nvvm-pipeline?

Additionally, I applied the following pass pipeline, and while it does separate the host and device code, the lowering is still incomplete:
gpu.memcpy, gpu.alloc, and builtin.unrealized_conversion_cast remain in the final output.

The pass-pipeline :

mlir-opt step2.mlir  -one-shot-bufferize="bufferize-function-boundaries=1 function-boundary-type-conversion=identity-layout-map" -convert-linalg-to-parallel-loops -canonicalize -gpu-map-parallel-loops -convert-parallel-loops-to-gpu -gpu-kernel-outlining -convert-index-to-llvm -reconcile-unrealized-casts -finalize-memref-to-llvm -gpu-to-llvm -gpu-lower-to-nvvm-pipeline="cubin-chip=sm_86 cubin-features=+ptx75 opt-level=3"  -canonicalize -cse

The result :

module attributes {gpu.container_module} {
  llvm.func @malloc(i64) -> !llvm.ptr
  llvm.mlir.global private constant @__constant_3x2xf64(dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<3 x array<2 x f64>>
  llvm.mlir.global private constant @__constant_2x3xf64(dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<2 x array<3 x f64>>
  llvm.func @main() {
    %0 = llvm.mlir.zero : !llvm.ptr
    %1 = llvm.mlir.constant(1 : index) : i64
    %2 = llvm.mlir.constant(3 : index) : i64
    %3 = llvm.mlir.constant(64 : index) : i64
    %4 = llvm.mlir.addressof @__constant_3x2xf64 : !llvm.ptr
    %5 = llvm.mlir.constant(0 : index) : i64
    %6 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %7 = llvm.mlir.addressof @__constant_2x3xf64 : !llvm.ptr
    %8 = llvm.mlir.constant(2 : index) : i64
    %9 = llvm.getelementptr %7[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<3 x f64>>
    %10 = llvm.getelementptr %4[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<3 x array<2 x f64>>
    %11 = llvm.getelementptr %0[6] : (!llvm.ptr) -> !llvm.ptr, f64
    %12 = llvm.ptrtoint %11 : !llvm.ptr to i64
    %13 = llvm.add %12, %3 : i64
    %14 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
    %15 = llvm.ptrtoint %14 : !llvm.ptr to i64
    %16 = llvm.sub %3, %1 : i64
    %17 = llvm.add %15, %16 : i64
    %18 = llvm.urem %17, %3 : i64
    %19 = llvm.sub %17, %18 : i64
    %20 = llvm.inttoptr %19 : i64 to !llvm.ptr
    %21 = llvm.insertvalue %14, %6[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %22 = llvm.insertvalue %20, %21[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %23 = llvm.insertvalue %5, %22[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %24 = llvm.insertvalue %8, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %25 = llvm.insertvalue %2, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %26 = llvm.insertvalue %2, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %27 = llvm.insertvalue %1, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %28 = builtin.unrealized_conversion_cast %27 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<2x3xf64>
    %29 = llvm.mul %1, %8 : i64
    %30 = llvm.mul %29, %2 : i64
    %31 = llvm.getelementptr %0[1] : (!llvm.ptr) -> !llvm.ptr, f64
    %32 = llvm.ptrtoint %31 : !llvm.ptr to i64
    %33 = llvm.mul %30, %32 : i64
    "llvm.intr.memcpy"(%20, %9, %33) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    %memref = gpu.alloc  () : memref<2x3xf64, 1 : i32>
    %34 = builtin.unrealized_conversion_cast %memref : memref<2x3xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.memcpy  %memref, %28 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
    %35 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
    %36 = llvm.ptrtoint %35 : !llvm.ptr to i64
    %37 = llvm.add %36, %16 : i64
    %38 = llvm.urem %37, %3 : i64
    %39 = llvm.sub %37, %38 : i64
    %40 = llvm.inttoptr %39 : i64 to !llvm.ptr
    %41 = llvm.insertvalue %35, %6[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %42 = llvm.insertvalue %40, %41[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %43 = llvm.insertvalue %5, %42[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %44 = llvm.insertvalue %2, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %45 = llvm.insertvalue %8, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %46 = llvm.insertvalue %8, %45[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %47 = llvm.insertvalue %1, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
    %48 = builtin.unrealized_conversion_cast %47 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x2xf64>
    %49 = llvm.mul %1, %2 : i64
    %50 = llvm.mul %49, %8 : i64
    %51 = llvm.mul %50, %32 : i64
    "llvm.intr.memcpy"(%40, %10, %51) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
    %memref_0 = gpu.alloc  () : memref<3x2xf64, 1 : i32>
    %52 = builtin.unrealized_conversion_cast %memref_0 : memref<3x2xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.memcpy  %memref_0, %48 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
    %53 = llvm.extractvalue %34[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %54 = llvm.extractvalue %34[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %55 = llvm.extractvalue %34[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %56 = llvm.extractvalue %34[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %57 = llvm.extractvalue %34[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %58 = llvm.extractvalue %34[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %59 = llvm.extractvalue %34[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %60 = llvm.extractvalue %52[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %61 = llvm.extractvalue %52[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %62 = llvm.extractvalue %52[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %63 = llvm.extractvalue %52[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %64 = llvm.extractvalue %52[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %65 = llvm.extractvalue %52[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    %66 = llvm.extractvalue %52[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
    gpu.launch_func  @main_kernel::@main_kernel blocks in (%2, %8, %1) threads in (%1, %1, %1) : i64 args(%1 : i64, %5 : i64, %53 : !llvm.ptr<1>, %54 : !llvm.ptr<1>, %55 : i64, %56 : i64, %57 : i64, %58 : i64, %59 : i64, %60 : !llvm.ptr<1>, %61 : !llvm.ptr<1>, %62 : i64, %63 : i64, %64 : i64, %65 : i64, %66 : i64)
    llvm.return
  }
  gpu.binary @main_kernel  [#gpu.object<#nvvm.target<O = 3, chip = "sm_86", features = "+ptx75">, "P\EDU\BA\01\00\10\00\C8\0F\00\00\00\00\00\00\02\00\01\01@\00\00\00(\0D\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00V\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00}\00\00\00\00\00\00\00\00\00\00\00\80\0C\00\00\00\00\00\00\80\09\00\00\00\00\00\00V\05V\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.nv.constant0.main_kernel\00.rel.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.rel.nv.constant0.main_kernel\00.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00main_kernel|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\02\00\00\00\00\00\00\04\04\00\00\00\04p\00\00\00\0C\81\80\80(\00\04\FC\FF\FF?\00\00\00\00\00\00\00\04/\08\00\06\00\00\00\08\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\047\04\00}\00\00\00\015\00\00\04\0A\08\00\02\00\00\00`\01\80\00\03\19\80\00\04\17\0C\00\00\00\00\00\0F\00x\00\00\F0!\00\04\17\0C\00\00\00\00\00\0E\00p\00\00\F0!\00\04\17\0C\00\00\00\00\00\0D\00h\00\00\F0!\00\04\17\0C\00\00\00\00\00\0C\00`sz\01\00\00\0A\00\00\00\0F\00\00\00\E4\0F\00\C3y\0B\00\00\00\00\00\00&\00\00\00\22\0E\00\B9z\04\00\00Z\00\00\00\0A\00\00\00\E4\0F\00\B9z\0C\00\00X\00\00\00\0A\00\00\00\E4\0F\00\82x\0E\00\18\00\00\00\00\00\00\00\00\E4\0F\00\B9z\08\00\00^\00\00\00\0A\00\00\00\E2\0F\00\C3y
\0A\00\00\00\00\00\00%\00\00\00b\0E\00\A5r\06\0B\0C\00\00\00\04\00\8E\0F\00\C8\1F\00\A4r\0B\0B\0D\00\00\00\07\02\8E\0F\00\E4\0F\00\A5r\08\06\0E\00\00\00\08\00\8E\0F\00\E4\0F\00\A5r\04\0A\0C\00\00\00\04\00\8E\0F\00\E4/\00\A4x\0C\0B\18\00\00\00?\02\8E\0F\00\E4\0F\00\A4r\07\0A\0D\00\00\00\05\02\8E\0F\00\E4\0F\00\91r\08\04\08\00\00\00?\18\80\0F\00\C4\0F\00\90r\05\09\0C\00\00\00?\E0\FF\0F\00\C6\0F\00\B9z\0C\00\00F\00\00\00\0A\00\00\00\E2\0F\00\02|\02\00\08\00\00\00\00\0F\00\08\00\E2\0F\00\91r\05\04\05\00\00\00\07\1C\0F\08\00\E4\0F\00\B9z\08\00\00l\00\00\00\0A\00\00\00\C8\0F\00$~\03\FF\05\00\00\00\FF\00\8E\0F\00\CC\0F\00\81y\02\02\0C\00\00\00\00\1B\1E\0C\00\A2\0E\00\91r\05\04\08\00\00\00? \80\0F\00\C8\0F\00\91r\07\04\09\00\00\00\07$\0F\08\00\E4\0F\00\91r\04\06\05\00\00\00?\18\80\0F\00\C8\0F\00\91r\06\06\07\00\00\00\0B\1C\0F\08\00\E4\0F\00\02|\04\00\04\00\00\00\00\0F\00\08\00\C8\0F\00$~\05\FF\06\00\00\00\FF\00\8E\0F\00\CA\0F\00\86y\00\04\02\00\00\00\0C\1B\10\0C\00\E2O\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\00\00\F0\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18yp\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00p\03\00\00\00\00\00\000\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00D\00\00\00\00\00\00p@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\03\00\00\00\00\00\008\01\00\00\00\00\00\00\03\00\00\00\0B\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\00\00\00\01\00\00p\00\00\00\00\00\00\00\00\00\00\0
0\00\00\00\00\00\D8\04\00\00\00\00\00\00 \00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\F4\00\00\00\0B\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F8\04\00\00\00\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\B5\00\00\00\09\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\08\05\00\00\00\00\00\00\10\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\10\00\00\00\00\00\00\00ph\04\00\00\00\00\00\00h\04\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\80\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01P\00\00\00\10\02\00\00\00\00\00\00\0F\02\00\00@\00\00\00\05\00\07\00V\00\00\00\00\00\00\00\00\00\00\00\11 \00\00\00\00\00\00\00\00\00\00\00\00\00\00[\05\00\00\00\00\00\00H\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\0A//\03\00\F3\1E\0A.version 7.5\0A.target sm_86\0A.address_size 64\0A1\00\F8\17isible .entry main_kernel(\0A.param .u64\19\00\11_\17\00?_0,!\00\0C\1A1!\00\FF\06.ptr .global .align 17\00\00\1F27\00#\1F3\8F\00\0D\1F4!\00\0D\1F5!\00\0D\1F6!\00\0D\1F7!\00\0D\1F8\DC\00#\1F97\00#\1F1\A3\01\0F\1F1\22\00\0E\1F2\22\00\0E\0Fz\01\0E\1F1{\01\0E\F1\0115\0A)\0A.maxntid 1,\03\00\F4\04\0A{\0A.reg .b32 %r<3>;\11\00\9564 %rd<16\13\00\10f\13\00\A2fd<2>;\0A\0Aldo\00\22.u)\00N1, [u\00=0];*\00\1F2*\00\02\911];\0Amov.u\89\00\F0\011, %ctaid.x;\0Acvt<\00\03\1B\000d3,!\00\0EX\00\1F4X\00\02\193X\00\152X\00\1DyX\00\105X\00\B22;\0Amad.lo.sV\00#6,\DE\00\04z\00\1Dd#\00\177#\00\1F5#\00\03#8,)\00!24\0A\00\1E4\BF\00\1F9\17\01\03\00B\014shl\90\01310,\96\00d3;\0Aadda\00611,h\00!10_\00\02\BE\02\04\B7\01\00\A1\01\01)\00\0BQ\00\142Q\00\1A4Q\00#3,\96\00\01%\00\0A4\00\144\CF\00\0B\85\00'5,;\00h14;\0Ast\86\00\01\80\0015],\8F\00\B0;\0Aret;\0A\0A}\0A\00\00">]

So the way to debug these is to identify the pass you expect to convert the operation, run all the prior passes in one invocation of mlir-opt and run the suspected pass in a separate invocation with -debug. That usually provides information as to why transforms do or do not happen. For most but not all lowering passes, one can restrict the output to -debug-only=dialect-conversion. In this case, I see:

//===-------------------------------------------===//
Legalizing operation : 'gpu.memcpy'(0x130f25190) {
  "gpu.memcpy"(%127, %111) : (memref<3x2xf64, 1 : i32>, memref<3x2xf64>) -> ()

  * Fold {
  } -> FAILURE : unable to fold

  * Pattern : 'gpu.memcpy -> ()' {
    ** Failure : Can only convert with exactly one async dependency.
  } -> FAILURE : pattern failed to match
} -> FAILURE : no matched legalization pattern

So GPU host operations should be made async. There happens to be a pass for that, gpu-async-region. After that, I suspect you’ll have issues because of using “1 : i32” as an address space. This is not something we expect to see in the host code, and we generally should be using named address spaces instead (and passes are likely to expect that).

1 Like

The issue was resolved after applying gpu-async-region!

Thank you for your help!