Hi, I am trying to lower the GPU dialect. I followed "Error at lower gpu dialect to llvmir - #3 by zhangxj19", but I found that `--gpu-to-cubin` isn't among the available passes. I am using llvm@79786c4d23f1fd7af438e4fd4e33ec109626bee4, whose last commit is dated Mon Jul 10 16:01:27 2023. Should I change my LLVM version or use a different pipeline? I am trying to lower this code:
// Container module holding one GPU kernel module (@kernels) and the host
// entry point (@main) that launches it.
module attributes {gpu.container_module} {
gpu.module @kernels {
// Element-wise f32 addition: each thread reads %a and %b at its x thread id
// and stores the sum into %c at the same index.
gpu.func @vec_add_kernel(%a: memref<?xf32>, %b: memref<?xf32>, %c: memref<?xf32>) attributes {gpu.kernel} {
// The x-dimension thread id is used directly as the element index.
%tid = gpu.thread_id x
%a_val = memref.load %a[%tid] : memref<?xf32>
%b_val = memref.load %b[%tid] : memref<?xf32>
%res = arith.addf %a_val, %b_val : f32
memref.store %res, %c[%tid] : memref<?xf32>
gpu.return
}
}
// Host side: allocates three 32-element f32 buffers and launches the kernel
// on a 1x1x1 grid with 32x1x1 threads per block.
func.func @main() {
%sz = arith.constant 32 : index
// NOTE(review): the buffers come from memref.alloc and nothing in this module
// registers or copies them for the device — confirm they are GPU-accessible.
%a = memref.alloc(%sz) : memref<?xf32>
%b = memref.alloc(%sz) : memref<?xf32>
%c = memref.alloc(%sz) : memref<?xf32>
%grid_size = arith.constant 1 : index
%c1 = arith.constant 1 : index
// Thread count in x equals the buffer length (32), so every element is covered.
%dim_size = arith.constant 32 : index
gpu.launch_func @kernels::@vec_add_kernel
blocks in (%grid_size, %c1, %c1)
threads in (%dim_size, %c1, %c1)
args(%a:memref<?xf32>, %b:memref<?xf32>, %c:memref<?xf32>)
return
}
}
I use the following pass pipeline (a Makefile target):
# Runs ${BUDDY_OPT} on ${INPUT} with a single pass pipeline: GPU kernel
# outlining and GPU-to-NVVM conversion first, then sparsification and
# bufferization, then the loop/CF/OpenMP lowerings and the *-to-llvm
# conversions, ending with -reconcile-unrealized-casts. Output goes to
# ./log.mlir.
# NOTE(review): no pass in this list appears to lower the host-side
# gpu.launch_func (e.g. --gpu-to-llvm), and --convert-gpu-to-nvvm is not
# nested under gpu.module — confirm against the MLIR passes documentation.
test-lower-gpu:
@${BUDDY_OPT} ${INPUT} \
--gpu-kernel-outlining \
--convert-gpu-to-nvvm \
--llvm-optimize-for-nvvm-target \
-sparsification --sparse-tensor-conversion \
-arith-bufferize \
-arith-expand \
-mem2reg \
-affine-expand-index-ops \
-tensor-bufferize \
-linalg-bufferize \
-func-bufferize -buffer-deallocation \
-lower-affine -memref-expand \
-convert-linalg-to-loops -convert-vector-to-scf -lower-affine -convert-scf-to-cf -convert-scf-to-openmp \
-convert-linalg-to-llvm -convert-openmp-to-llvm -convert-vector-to-llvm -convert-index-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -convert-func-to-llvm \
-reconcile-unrealized-casts -o ./log.mlir
The error occurs in the final `-reconcile-unrealized-casts` pass; it seems that the cast of `%grid_size` from `i64` to `index` cannot be resolved:
module attributes {gpu.container_module, llvm.data_layout = ""} {
llvm.func @free(!llvm.ptr)
llvm.func @malloc(i64) -> !llvm.ptr
gpu.module @kernels {
llvm.func @vec_add_kernel(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: i64, %arg9: i64, %arg10: !llvm.ptr, %arg11: !llvm.ptr, %arg12: i64, %arg13: i64, %arg14: i64) attributes {gpu.kernel, nvvm.kernel} {
%0 = nvvm.read.ptx.sreg.tid.x : i32
%1 = llvm.sext %0 : i32 to i64
%2 = llvm.getelementptr %arg1[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%3 = llvm.load %2 : !llvm.ptr -> f32
%4 = llvm.getelementptr %arg6[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
%5 = llvm.load %4 : !llvm.ptr -> f32
%6 = llvm.fadd %3, %5 : f32
%7 = llvm.getelementptr %arg11[%1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
llvm.store %6, %7 : f32, !llvm.ptr
llvm.return
}
}
llvm.func @main() {
%0 = llvm.mlir.constant(0 : index) : i64
%1 = llvm.mlir.constant(32 : index) : i64
%2 = llvm.mlir.constant(1 : index) : i64
%3 = builtin.unrealized_conversion_cast %2 : i64 to index
%4 = builtin.unrealized_conversion_cast %1 : i64 to index
%5 = llvm.mlir.null : !llvm.ptr
%6 = llvm.getelementptr %5[32] : (!llvm.ptr) -> !llvm.ptr, f32
%7 = llvm.ptrtoint %6 : !llvm.ptr to i64
%8 = llvm.call @malloc(%7) : (i64) -> !llvm.ptr
%9 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%10 = llvm.insertvalue %8, %9[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%11 = llvm.insertvalue %8, %10[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%12 = llvm.insertvalue %0, %11[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%13 = llvm.insertvalue %1, %12[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%14 = llvm.insertvalue %2, %13[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%15 = builtin.unrealized_conversion_cast %14 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
%16 = llvm.mlir.null : !llvm.ptr
%17 = llvm.getelementptr %16[32] : (!llvm.ptr) -> !llvm.ptr, f32
%18 = llvm.ptrtoint %17 : !llvm.ptr to i64
%19 = llvm.call @malloc(%18) : (i64) -> !llvm.ptr
%20 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%21 = llvm.insertvalue %19, %20[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%22 = llvm.insertvalue %19, %21[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%23 = llvm.insertvalue %0, %22[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%24 = llvm.insertvalue %1, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%25 = llvm.insertvalue %2, %24[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%26 = builtin.unrealized_conversion_cast %25 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
%27 = llvm.mlir.null : !llvm.ptr
%28 = llvm.getelementptr %27[32] : (!llvm.ptr) -> !llvm.ptr, f32
%29 = llvm.ptrtoint %28 : !llvm.ptr to i64
%30 = llvm.call @malloc(%29) : (i64) -> !llvm.ptr
%31 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%32 = llvm.insertvalue %30, %31[0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%33 = llvm.insertvalue %30, %32[1] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%34 = llvm.insertvalue %0, %33[2] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%35 = llvm.insertvalue %1, %34[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%36 = llvm.insertvalue %2, %35[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
%37 = builtin.unrealized_conversion_cast %36 : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> to memref<?xf32>
gpu.launch_func @kernels::@vec_add_kernel blocks in (%3, %3, %3) threads in (%4, %3, %3) args(%15 : memref<?xf32>, %26 : memref<?xf32>, %37 : memref<?xf32>)
llvm.call @free(%30) : (!llvm.ptr) -> ()
llvm.call @free(%19) : (!llvm.ptr) -> ()
llvm.call @free(%8) : (!llvm.ptr) -> ()
llvm.return
}
}
error: failed to legalize operation 'builtin.unrealized_conversion_cast' that was explicitly marked illegal
%grid_size = arith.constant 1 : index
How should I lower my code to the GPU target? Thanks for your help!