Lower gpu dialect failed

Hi, I am trying to lower the gpu dialect. I followed the thread "Error at lower gpu dialect to llvmir" (post #3 by zhangxj19), but I found that --gpu-to-cubin isn't available among the passes in my build. I am using llvm@79786c4d23f1fd7af438e4fd4e33ec109626bee4, whose last commit is dated Mon Jul 10 16:01:27 2023. Should I change my LLVM version or use a different pipeline? I am trying to lower this code:

// Minimal GPU vector-add reproducer: a device kernel in @kernels plus a host
// @main that allocates three buffers and launches the kernel once.
module attributes {gpu.container_module} {
    // Device-side module containing the kernel.
    gpu.module @kernels {
        // Each thread computes c[tid] = a[tid] + b[tid], where tid is the x thread id.
        // No bounds check is performed against the (dynamic) memref size.
        gpu.func @vec_add_kernel(%a: memref<?xf32>, %b: memref<?xf32>, %c: memref<?xf32>) attributes {gpu.kernel} {
            %tid = gpu.thread_id x
            %a_val = memref.load %a[%tid] : memref<?xf32>
            %b_val = memref.load %b[%tid] : memref<?xf32>
            %res = arith.addf %a_val, %b_val : f32
            memref.store %res, %c[%tid] : memref<?xf32>
            gpu.return
        }
    }
    // Host entry point: allocates three 32-element f32 buffers and launches the
    // kernel with a 1x1x1 grid of 32x1x1-thread blocks.
    func.func @main() {
        %sz = arith.constant 32 : index
        // The buffers are never initialized, read back, or deallocated in this snippet.
        // NOTE(review): these are plain memref.alloc buffers; whether they are
        // accessible from the device is not established here - confirm.
        %a = memref.alloc(%sz) : memref<?xf32>
        %b = memref.alloc(%sz) : memref<?xf32>
        %c = memref.alloc(%sz) : memref<?xf32>
        %grid_size = arith.constant 1 : index
        %c1 = arith.constant 1 : index
        %dim_size = arith.constant 32 : index
        // One thread per element: block x-dimension (%dim_size) equals the buffer size (%sz).
        gpu.launch_func @kernels::@vec_add_kernel
        blocks in (%grid_size, %c1, %c1)
        threads in (%dim_size, %c1, %c1)
        args(%a:memref<?xf32>, %b:memref<?xf32>, %c:memref<?xf32>)
        return
    }    
}

using the following pipeline:

# test-lower-gpu: run ${BUDDY_OPT} on ${INPUT} through the pass pipeline below
# and write the resulting IR to ./log.mlir. The leading '@' suppresses echoing
# of the command; the whole recipe is a single backslash-continued command line.
#
# NOTE(review): the pipeline lowers the kernel (--convert-gpu-to-nvvm) but lists
# no pass that lowers the host-side gpu.launch_func; per the follow-up in this
# thread, adding -gpu-to-cubin and -gpu-to-llvm resolved the
# -reconcile-unrealized-casts failure (exact placement is not stated).
# NOTE(review): -lower-affine is listed twice; presumably redundant - confirm.
test-lower-gpu:
	@${BUDDY_OPT} ${INPUT} \
		--gpu-kernel-outlining \
		--convert-gpu-to-nvvm \
		--llvm-optimize-for-nvvm-target \
		-sparsification --sparse-tensor-conversion \
		-arith-bufferize \
		-arith-expand \
		-mem2reg \
		-affine-expand-index-ops \
		-tensor-bufferize \
		-linalg-bufferize \
		-func-bufferize -buffer-deallocation \
		-lower-affine -memref-expand \
		-convert-linalg-to-loops -convert-vector-to-scf  -lower-affine -convert-scf-to-cf -convert-scf-to-openmp \
		-convert-linalg-to-llvm -convert-openmp-to-llvm -convert-vector-to-llvm -convert-index-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -convert-func-to-llvm \
		-reconcile-unrealized-casts -o ./log.mlir

The error occurs in the final -reconcile-unrealized-casts pass; it seems that the cast of grid_size from i64 to index cannot be resolved:

// IR dumped when -reconcile-unrealized-casts fails. The kernel and most of
// @main are already in the LLVM/NVVM dialects, but gpu.launch_func is still
// present and still takes index/memref operands, so the
// builtin.unrealized_conversion_cast ops feeding it remain live.
module attributes {gpu.container_module, llvm.data_layout = ""} {
  llvm.func @free(!llvm.ptr)
  llvm.func @malloc(i64) -> !llvm.ptr
  gpu.module @kernels {
    // Each memref<?xf32> argument is expanded into five scalars
    // (ptr, ptr, i64, i64, i64), giving 15 arguments for the three memrefs.
    // Loads/stores use the second pointer of each group (%arg1, %arg6, %arg11),
    // presumably the aligned pointer of the memref descriptor.
    llvm.func @vec_add_kernel(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} {
      %0 = nvvm.read.ptx.sreg.tid.x : i32
      %1 = llvm.sext %0 : i32 to i64
      %2 = llvm.getelementptr %arg1[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
      %3 = llvm.load %2 : !llvm.ptr -> f32
      %4 = llvm.getelementptr %arg6[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
      %5 = llvm.load %4 : !llvm.ptr -> f32
      %6 = llvm.fadd %3, %5  : f32
      %7 = llvm.getelementptr %arg11[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
      llvm.store %6, %7 : f32, !llvm.ptr
      llvm.return
    }
  }
  llvm.func @main() {
    %0 = llvm.mlir.constant(0 : index) : i64
    %1 = llvm.mlir.constant(32 : index) : i64
    %2 = llvm.mlir.constant(1 : index) : i64
    // i64 -> index casts kept alive only because gpu.launch_func below still
    // consumes index-typed grid/block sizes.
    %3 = builtin.unrealized_conversion_cast %2 : i64 to index
    %4 = builtin.unrealized_conversion_cast %1 : i64 to index
    // First buffer: malloc of 32 * sizeof(f32) bytes (size computed via the
    // null-pointer getelementptr/ptrtoint idiom), wrapped in a descriptor struct.
    %5 = llvm.mlir.null : !llvm.ptr
    %6 = llvm.getelementptr %5[32] : (!llvm.ptr) -> !llvm.ptr, f32
    %7 = llvm.ptrtoint %6 : !llvm.ptr to i64
    %8 = llvm.call @malloc(%7) : (i64) -> !llvm.ptr
    %9 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %11 = llvm.insertvalue %8, %10[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %12 = llvm.insertvalue %0, %11[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %13 = llvm.insertvalue %1, %12[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %14 = llvm.insertvalue %2, %13[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    // struct -> memref cast, again only needed by the unlowered gpu.launch_func.
    %15 = builtin.unrealized_conversion_cast %14 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
    // Second buffer, same construction.
    %16 = llvm.mlir.null : !llvm.ptr
    %17 = llvm.getelementptr %16[32] : (!llvm.ptr) -> !llvm.ptr, f32
    %18 = llvm.ptrtoint %17 : !llvm.ptr to i64
    %19 = llvm.call @malloc(%18) : (i64) -> !llvm.ptr
    %20 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %21 = llvm.insertvalue %19, %20[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %22 = llvm.insertvalue %19, %21[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %23 = llvm.insertvalue %0, %22[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %24 = llvm.insertvalue %1, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %25 = llvm.insertvalue %2, %24[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %26 = builtin.unrealized_conversion_cast %25 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
    // Third buffer, same construction.
    %27 = llvm.mlir.null : !llvm.ptr
    %28 = llvm.getelementptr %27[32] : (!llvm.ptr) -> !llvm.ptr, f32
    %29 = llvm.ptrtoint %28 : !llvm.ptr to i64
    %30 = llvm.call @malloc(%29) : (i64) -> !llvm.ptr
    %31 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
    %32 = llvm.insertvalue %30, %31[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %33 = llvm.insertvalue %30, %32[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %34 = llvm.insertvalue %0, %33[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %35 = llvm.insertvalue %1, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %36 = llvm.insertvalue %2, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> 
    %37 = builtin.unrealized_conversion_cast %36 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
    // The only op in @main not converted to the LLVM dialect. Its operands
    // (%3, %4, %15, %26, %37) are the results of the casts above, which is why
    // those casts cannot be eliminated.
    gpu.launch_func  @kernels::@vec_add_kernel blocks in (%3, %3, %3) threads in (%4, %3, %3) args(%15 : memref<?xf32>, %26 : memref<?xf32>, %37 : memref<?xf32>)
    llvm.call @free(%30) : (!llvm.ptr) -> ()
    llvm.call @free(%19) : (!llvm.ptr) -> ()
    llvm.call @free(%8) : (!llvm.ptr) -> ()
    llvm.return
  }
}
error: failed to legalize operation 'builtin.unrealized_conversion_cast' that was explicitly marked illegal
        %grid_size = arith.constant 1 : index

How should I lower my code to the GPU target? Thanks for your help!

I found another Discourse thread about this problem, "Conversion of GPU kernel failed because 'gpu-to-cubin' pipeline was not found". I rebuilt LLVM with -DMLIR_ENABLE_CUDA_RUNNER=ON, after which -gpu-to-cubin became available. I then added -gpu-to-cubin and -gpu-to-llvm to the pipeline, which fixed the unrealized_conversion_cast error :grinning:

Hi, I am facing the same situation as you. May I ask where exactly you put -gpu-to-cubin and -gpu-to-llvm in the pipeline?

I tried putting them at the end, and also just before -reconcile-unrealized-casts, but neither works.

Thank you!

There is a “reference pipeline” as an example that is accessible with the test-lower-to-nvvm pass, implemented here: https://github.com/llvm/llvm-project/tree/main/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp

3 Likes

Got it, thank you