Hello,
I lowered a simple program that allocates GPU memory and copies data into it:
module {
// Entry point of the first experiment: materializes two constant f64 tensors,
// views each one as a memref, allocates a same-shaped buffer in memory space 1
// with gpu.alloc, and copies the data into it with gpu.memcpy.
// Nothing reads %memref or %memref_1 after the copies; the function returns
// immediately afterwards.
func.func @main() {
// Constant inputs: a 2x3 matrix and a 3x2 matrix.
%cst = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>
%cst_0 = arith.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>
// 2x3 input: tensor -> memref, then allocate the memory-space-1 buffer and fill it.
// Both gpu ops are written without the `async` keyword and take no tokens.
%0 = bufferization.to_memref %cst : tensor<2x3xf64> to memref<2x3xf64>
%memref = gpu.alloc () : memref<2x3xf64, 1 : i32>
gpu.memcpy %memref, %0 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
// 3x2 input: same sequence as above.
%1 = bufferization.to_memref %cst_0 : tensor<3x2xf64> to memref<3x2xf64>
%memref_1 = gpu.alloc () : memref<3x2xf64, 1 : i32>
gpu.memcpy %memref_1, %1 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
return
}
}
The pass pipeline I used:
--pass-pipeline="builtin.module(one-shot-bufferize{bufferize-function-boundaries=true},convert-index-to-llvm,finalize-memref-to-llvm,gpu-lower-to-nvvm-pipeline{cubin-chip=sm_86 cubin-features=+ptx75 opt-level=3})"
The result:
// NOTE(review): the `//` comments in this listing are reviewer annotations added
// to the pasted tool output; the compiler did not emit them.
module {
// External allocator used for both buffers below.
llvm.func @malloc(i64) -> !llvm.ptr
// The two dense constants from the input, emitted as private globals with 64-byte alignment.
llvm.mlir.global private constant @__constant_3x2xf64(dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<3 x array<2 x f64>>
llvm.mlir.global private constant @__constant_2x3xf64(dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<2 x array<3 x f64>>
// Lowered @main. Every op in this function is in the llvm dialect: the only
// allocations are calls to @malloc and the only copies are llvm.intr.memcpy.
// No gpu.* op and no GPU runtime call appears anywhere in this function.
// NOTE(review): presumably the unused gpu.alloc/gpu.memcpy from the input were
// removed rather than lowered — confirm, e.g. with --mlir-print-ir-after-all.
llvm.func @main() {
%0 = llvm.mlir.constant(64 : index) : i64
%1 = llvm.mlir.addressof @__constant_3x2xf64 : !llvm.ptr
%2 = llvm.mlir.addressof @__constant_2x3xf64 : !llvm.ptr
%3 = llvm.mlir.constant(2 : index) : i64
%4 = llvm.mlir.constant(3 : index) : i64
%5 = llvm.mlir.constant(1 : index) : i64
%6 = llvm.mlir.zero : !llvm.ptr
// Pointers to the first element of each constant global.
%7 = llvm.getelementptr %2[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<3 x f64>>
%8 = llvm.getelementptr %1[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<3 x array<2 x f64>>
// %11 = byte size of 6 f64 elements (GEP on the zero pointer, then ptrtoint) plus 64.
%9 = llvm.getelementptr %6[6] : (!llvm.ptr) -> !llvm.ptr, f64
%10 = llvm.ptrtoint %9 : !llvm.ptr to i64
%11 = llvm.add %10, %0 : i64
// First buffer: malloc, then round the address up to a multiple of 64:
// %17 = (p + 63) - ((p + 63) urem 64).
%12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
%13 = llvm.ptrtoint %12 : !llvm.ptr to i64
%14 = llvm.sub %0, %5 : i64
%15 = llvm.add %13, %14 : i64
%16 = llvm.urem %15, %0 : i64
%17 = llvm.sub %15, %16 : i64
%18 = llvm.inttoptr %17 : i64 to !llvm.ptr
// %23 = 1 * 2 * 3 * (byte size of one f64): number of bytes to copy.
%19 = llvm.mul %5, %3 : i64
%20 = llvm.mul %19, %4 : i64
%21 = llvm.getelementptr %6[1] : (!llvm.ptr) -> !llvm.ptr, f64
%22 = llvm.ptrtoint %21 : !llvm.ptr to i64
%23 = llvm.mul %20, %22 : i64
// Copy the 2x3 constant into the first aligned buffer.
"llvm.intr.memcpy"(%18, %7, %23) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
// Second buffer: same size (%11) and the same align-up sequence.
%24 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr
%25 = llvm.ptrtoint %24 : !llvm.ptr to i64
%26 = llvm.add %25, %14 : i64
%27 = llvm.urem %26, %0 : i64
%28 = llvm.sub %26, %27 : i64
%29 = llvm.inttoptr %28 : i64 to !llvm.ptr
%30 = llvm.mul %5, %4 : i64
%31 = llvm.mul %30, %3 : i64
%32 = llvm.mul %31, %22 : i64
// Copy the 3x2 constant into the second aligned buffer.
"llvm.intr.memcpy"(%29, %8, %32) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
llvm.return
}
}
Then I wanted to perform a transpose on the GPU, so I added `linalg.transpose`, but encountered an issue during lowering:
// Second experiment: same allocation/copy sequence as before, followed by a
// linalg.transpose that reads one gpu.alloc'ed buffer and writes the other.
module {
// Entry point: fills two memory-space-1 buffers from constants, then transposes
// the 2x3 buffer into the 3x2 buffer.
func.func @main() {
// Constant inputs: a 2x3 matrix and a 3x2 matrix.
%cst = arith.constant dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>
%cst_0 = arith.constant dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>
// 2x3 input: tensor -> memref, then allocate the memory-space-1 buffer and fill it.
// gpu.alloc and gpu.memcpy are written without the `async` keyword and take no tokens.
// NOTE(review): the GPU-to-LLVM conversion patterns for these two ops may only
// match the async form — confirm against the GPU dialect documentation.
%0 = bufferization.to_memref %cst : tensor<2x3xf64> to memref<2x3xf64>
%memref = gpu.alloc () : memref<2x3xf64, 1 : i32>
gpu.memcpy %memref, %0 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
// 3x2 buffer: same sequence; it is later overwritten as the transpose output.
%1 = bufferization.to_memref %cst_0 : tensor<3x2xf64> to memref<3x2xf64>
%memref_1 = gpu.alloc () : memref<3x2xf64, 1 : i32>
gpu.memcpy %memref_1, %1 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
// Transpose with permutation [1, 0]: reads %memref (2x3), writes %memref_1 (3x2).
// Unlike the first program, both gpu.alloc results are used after the copies.
linalg.transpose ins(%memref : memref<2x3xf64, 1 : i32>) outs(%memref_1 : memref<3x2xf64, 1 : i32>) permutation = [1, 0]
return
}
}
The pass pipeline I used:
--pass-pipeline="builtin.module(one-shot-bufferize{bufferize-function-boundaries=true},convert-linalg-to-affine-loops,func.func(convert-affine-for-to-gpu{gpu-block-dims=1 gpu-thread-dims=1}),finalize-memref-to-llvm,gpu-lower-to-nvvm-pipeline{cubin-chip=sm_86 cubin-features=+ptx75 opt-level=3})"
The result:
module attributes {gpu.container_module} {
llvm.func @malloc(i64) -> !llvm.ptr
llvm.mlir.global private constant @__constant_3x2xf64(dense<[[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00], [5.000000e+00, 6.000000e+00]]> : tensor<3x2xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<3 x array<2 x f64>>
llvm.mlir.global private constant @__constant_2x3xf64(dense<[[1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<2 x array<3 x f64>>
// NOTE(review): the `//` comments in this function are reviewer annotations added
// to the pasted tool output; the compiler did not emit them.
// Lowered @main of the transpose program. Most ops are in the llvm dialect, but
// two gpu.alloc ops, two gpu.memcpy ops and four builtin.unrealized_conversion_cast
// ops remain, bridging between memref values and LLVM descriptor structs.
llvm.func @main() {
%0 = llvm.mlir.zero : !llvm.ptr
%1 = llvm.mlir.addressof @__constant_2x3xf64 : !llvm.ptr
%2 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%3 = llvm.mlir.constant(0 : index) : i64
%4 = llvm.mlir.addressof @__constant_3x2xf64 : !llvm.ptr
%5 = llvm.mlir.constant(64 : index) : i64
%6 = llvm.mlir.constant(3 : index) : i64
%7 = llvm.mlir.constant(1 : index) : i64
%8 = llvm.mlir.constant(2 : index) : i64
// Pointers to the first element of each constant global.
%9 = llvm.getelementptr %1[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x array<3 x f64>>
%10 = llvm.getelementptr %4[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<3 x array<2 x f64>>
// %13 = byte size of 6 f64 elements (GEP on the zero pointer, then ptrtoint) plus 64.
%11 = llvm.getelementptr %0[6] : (!llvm.ptr) -> !llvm.ptr, f64
%12 = llvm.ptrtoint %11 : !llvm.ptr to i64
%13 = llvm.add %12, %5 : i64
// First malloc'ed buffer; address rounded up to a multiple of 64:
// %19 = (p + 63) - ((p + 63) urem 64).
%14 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
%15 = llvm.ptrtoint %14 : !llvm.ptr to i64
%16 = llvm.sub %5, %7 : i64
%17 = llvm.add %15, %16 : i64
%18 = llvm.urem %17, %5 : i64
%19 = llvm.sub %17, %18 : i64
%20 = llvm.inttoptr %19 : i64 to !llvm.ptr
// Build the descriptor struct for the 2x3 buffer: malloc'ed pointer, aligned
// pointer, 0, {2, 3}, {3, 1} (presumably offset, sizes and strides of the memref
// descriptor — the cast to memref<2x3xf64> below consumes it).
%21 = llvm.insertvalue %14, %2[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%22 = llvm.insertvalue %20, %21[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%23 = llvm.insertvalue %3, %22[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%24 = llvm.insertvalue %8, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%25 = llvm.insertvalue %6, %24[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%26 = llvm.insertvalue %6, %25[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%27 = llvm.insertvalue %7, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
// Leftover cast: turns the struct back into a memref so the remaining gpu.memcpy can use it.
%28 = builtin.unrealized_conversion_cast %27 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<2x3xf64>
// %33 = 1 * 2 * 3 * (byte size of one f64): number of bytes to copy.
%29 = llvm.mul %7, %8 : i64
%30 = llvm.mul %29, %6 : i64
%31 = llvm.getelementptr %0[1] : (!llvm.ptr) -> !llvm.ptr, f64
%32 = llvm.ptrtoint %31 : !llvm.ptr to i64
%33 = llvm.mul %30, %32 : i64
// Copy the 2x3 constant into the first aligned buffer.
"llvm.intr.memcpy"(%20, %9, %33) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
// Not lowered: gpu.alloc and gpu.memcpy for the 2x3 buffer are still in the gpu
// dialect. The cast in between exposes the memref as a struct with ptr<1> fields
// for the extractvalue ops further down.
%memref = gpu.alloc () : memref<2x3xf64, 1 : i32>
%34 = builtin.unrealized_conversion_cast %memref : memref<2x3xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
gpu.memcpy %memref, %28 : memref<2x3xf64, 1 : i32>, memref<2x3xf64>
// Second malloc'ed buffer: same size (%13) and the same align-up sequence.
%35 = llvm.call @malloc(%13) : (i64) -> !llvm.ptr
%36 = llvm.ptrtoint %35 : !llvm.ptr to i64
%37 = llvm.add %36, %16 : i64
%38 = llvm.urem %37, %5 : i64
%39 = llvm.sub %37, %38 : i64
%40 = llvm.inttoptr %39 : i64 to !llvm.ptr
// Descriptor struct for the 3x2 buffer: malloc'ed pointer, aligned pointer, 0, {3, 2}, {2, 1}.
%41 = llvm.insertvalue %35, %2[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%42 = llvm.insertvalue %40, %41[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%43 = llvm.insertvalue %3, %42[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%44 = llvm.insertvalue %6, %43[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%45 = llvm.insertvalue %8, %44[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%46 = llvm.insertvalue %8, %45[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
%47 = llvm.insertvalue %7, %46[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
// Leftover cast: struct -> memref<3x2xf64> for the second remaining gpu.memcpy.
%48 = builtin.unrealized_conversion_cast %47 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<3x2xf64>
%49 = llvm.mul %7, %6 : i64
%50 = llvm.mul %49, %8 : i64
%51 = llvm.mul %50, %32 : i64
// Copy the 3x2 constant into the second aligned buffer.
"llvm.intr.memcpy"(%40, %10, %51) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> ()
// Not lowered: gpu.alloc and gpu.memcpy for the 3x2 buffer, with the same bridging cast.
%memref_0 = gpu.alloc () : memref<3x2xf64, 1 : i32>
%52 = builtin.unrealized_conversion_cast %memref_0 : memref<3x2xf64, 1 : i32> to !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
gpu.memcpy %memref_0, %48 : memref<3x2xf64, 1 : i32>, memref<3x2xf64>
// Unpack the 2x3 gpu buffer descriptor (%34) into its seven scalar fields.
%53 = llvm.extractvalue %34[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%54 = llvm.extractvalue %34[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%55 = llvm.extractvalue %34[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%56 = llvm.extractvalue %34[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%57 = llvm.extractvalue %34[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%58 = llvm.extractvalue %34[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%59 = llvm.extractvalue %34[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
// Unpack the 3x2 gpu buffer descriptor (%52) the same way.
%60 = llvm.extractvalue %52[0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%61 = llvm.extractvalue %52[1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%62 = llvm.extractvalue %52[2] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%63 = llvm.extractvalue %52[3, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%64 = llvm.extractvalue %52[3, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%65 = llvm.extractvalue %52[4, 0] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
%66 = llvm.extractvalue %52[4, 1] : !llvm.struct<(ptr<1>, ptr<1>, i64, array<2 x i64>, array<2 x i64>)>
// Launch @main_kernel with a (3, 1, 1) grid of blocks and (2, 1, 1) threads per
// block, passing the 14 unpacked descriptor fields as kernel arguments.
gpu.launch_func @main_kernel::@main_kernel blocks in (%6, %7, %7) threads in (%8, %7, %7) : i64 args(%53 : !llvm.ptr<1>, %54 : !llvm.ptr<1>, %55 : i64, %56 : i64, %57 : i64, %58 : i64, %59 : i64, %60 : !llvm.ptr<1>, %61 : !llvm.ptr<1>, %62 : i64, %63 : i64, %64 : i64, %65 : i64, %66 : i64)
llvm.return
}
gpu.binary @main_kernel [#gpu.object<#nvvm.target<O = 3, chip = "sm_86", features = "+ptx75">, "P\EDU\BA\01\00\10\00\A0\0E\00\00\00\00\00\00\02\00\01\01@\00\00\00(\0C\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\01\00V\00\00\00\00\00\00\00\00\00\00\00\11\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00}\00\00\00\00\00\00\00\00\00\00\00\80\0B\00\00\00\00\00\00\80\08\00\00\00\00\00\00V\05V\00@\008\00\03\00@\00\0C\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.nv.constant0.main_kernel\00.rel.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.info\00.text.main_kernel\00.nv.info.main_kernel\00.nv.shared.main_kernel\00.rel.nv.constant0.main_kernel\00.nv.constant0.main_kernel\00.debug_frame\00.rel.debug_frame\00.rela.debug_frame\00.nv.callgraph\00.nv.prototype\00.nv.rel.action\00main_kernel|\FF\FF\FF\FF\0F\0C\81\80\80(\00\08\FF\81\80(\08\81\80\80(\00\00\00\FF\FF\FF\FF4\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\80\01\00\00\00\00\00\00\04\04\00\00\00\044\00\00\00\0C\81\80\80(\00\04\FC\FF\FF?\00\00\00\00\00\00\00\04/\08\00\06\00\00\00\0A\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\04\11\08\00\06\00\00\00\00\00\00\00\04\12\08\00\06\00\00\00\00\00\00\00\047\04\00}\00\00\00\015\00\00\04\0A\08\00\02\00\00\00`\01p\00\03\19p\00\04\17\0C\00\00\00\00\00\0D\00h\00\00\F0!\00\04\17\0C\00\00\00\00\00\0C\00`sz\01\00\00\0A\00\00\00\0F\00\00\00\E4\0F\00\19y\07\00\00\00\00\00\00!\00\00\00\22\0E\00\02x\02\00\18\00\00\00\00\0F\00\00\00\E2\0F\00\B9z\04\00\00F\00\00\00\0A\00\00\00\E4\0F\00\19y\05\00\00\00\00\00\00%\00\00\00d\0E\00%v\02\07\00Z\00\00\02\00\8E\07\00\CC\1F\00%x\02\05\08\00\00\00\02\00\8E\07\00\CC/\00\81y\02\02\04\00\00\00\00\1B\1E\0C\00\A2\0E\00%x\04\05\02\00\
00\00\FF\00\8E\07\00\CA\0F\00\12r\07\04\07\00\00\00\FF\FC\8E\07\00\C8\0F\00\11z\04\07\00h\00\00\FF\18\80\07\00\C8\0F\00\11z\05\07\00i\00\00\05\1C\0F\00\00\CA\0F\00\86y\00\04\02\00\00\00\04\1B\10\0C\00\E2O\00My\00\00\00\00\00\00\00\00\80\03\00\EA\0F\00Gy\00\00\F0\FF\FF\FF\FF\FF\83\03\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18y\00\00\00\00\00\00\00\00\00\00\00\C0\0F\00\18yp\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00)\00\00\00\00\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00p\03\00\00\00\00\00\000\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00D\00\00\00\00\00\00p@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A0\03\00\00\00\00\00\00\18\01\00\00\00\00\00\00\03\00\00\00\0B\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\00\00\00\01\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\B8\04\00\00\00\00\00\00 
\00\00\00\00\00\00\00\03\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\F4\00\00\00\0B\00\00p\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\D8\04\00\00\00\00\00\00\10\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\B5\00\00\00\09\00\00\00@\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\E8\04\00\00\00\00\00\00\10\00\00\00\00\00\00\00\03\00\00\00\04\00\00\00\08\00\00\00\00\00\00\00\10\00\00\00\00\00\00\00p\00\00\00\01\00\00\00B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\F8\04\00\00\00\00\00\00\D0\01\00\00\00\00\00\00\00\00\00\00\0B\00\00\00\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\002\00\00\00\01\00\00\00\06\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\07\00\00\00\00\00\00\80\01\00\00\00\00\00\00\03\00\00\00\06\00\00\0A\80\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\06\00\00\00\05\00\00\00\80\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\F8\04\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\88\03\00\00\00\00\00\00\88\03\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\00\00\05\00\00\00\80\0B\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\A8\00\00\00\00\00\00\00\08\00\00\00\00\00\00\00\01\00\01\01P\00\00\00\E8\01\00\00\00\00\00\00\E6\01\00\00@\00\00\00\05\00\07\00V\00\00\00\00\00\00\00\00\00\00\00\11 \00\00\00\00\00\00\00\00\00\00\00\00\00\00w\04\00\00\00\00\00\00H\00\00\00\00\00\00\00\00\00\00\00\00\00\00\000\0A//\03\00\F3\1E\0A.version 7.5\0A.target sm_86\0A.address_size 64\0A1\00\F8-isible .entry main_kernel(\0A.param .u64 .ptr .global .align 1/\00\11_-\00?_0,7\00\22\1A17\00\0E!\00\1F2!\00\0D\1F3!\00\0D\1F4!\00\0D\1F5!\00\0D\1F6\DC\00#\1F77\00#\1F8\8F\00\0D\1F9!\00\0D/10\22\00\0E\0Fx\01\0E\1F1y\01\0E\F4\1913\0A)\0A.maxntid 2, 1, 1\0A{\0A.reg .b32 %r<3>;\11\00\9564 
%rd<12\13\00\10f\13\000fd<\12\00a\0Amov.u5\00\F2\001, %ctaid.x;\0Ald\86\00\22.u@\00O1, [\8C\00\00'];A\00R2, %t?\000cvt:\00\03\19\00\10d\1A\00\C4r2;\0Amul.wide\18\00\223,7\00\9224;\0Aadd.sm\00#4,s\00\00#\00\0E6\00\225,\AE\00\1985\00&6,;\00\115\C1\00\02\F4\01\04\EF\00\00\C2\00\00'\00\1E]\DD\00\1F7\DD\00\02.8]{\00\138{\00T2;\0Aor\\\01%9,\1B\00td2;\0Ashl\1A\00310,!\00\193\AC\00311,x\00\01%\00H;\0Ast\AE\00\11[#\00!],\B7\00\B0;\0Aret;\0A\0A}\0A\00\00\00">]
}
I performed the lowering, but the resulting code still contains `builtin.unrealized_conversion_cast`, `gpu.alloc`, and `gpu.memcpy`.
I have two questions:
- Can this issue be resolved by modifying the pass pipeline? If not, what should I do?
- I applied the `gpu-lower-to-nvvm-pipeline` pipeline — how is it possible that GPU dialect operations are still present?
Thank you for reading!