Help lowering GPU modules to LLVM

I’m looking for help getting my conversion pipeline for GPU + SCF to LLVM working. It has been working fine so far, but I’ve now run into some new programs where it fails, and I’m looking to fix what’s going wrong.

I’m trying to lower the following snippet of code. There are various idiosyncrasies in this code, like loading dimension values from memrefs rather than using memref.dim; please ignore these, as there is a good reason it is done this way.

#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
#map2 = affine_map<(d0, d1, d2) -> (256, d1 - d2)>
module attributes {gpu.container_module} {
  func.func @legateMLIRKernel41(%arg0: memref<?x?xf64>, %arg1: memref<f64>, %arg2: memref<2xindex>, %arg3: memref<0xindex>) attributes {llvm.emit_c_interface} {
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0x7FF0000000000000 : f64
    %0 = memref.load %arg1[] : memref<f64>
    %1 = memref.load %arg2[%c0] : memref<2xindex>
    %2 = memref.load %arg2[%c1] : memref<2xindex>
    %3 = arith.muli %1, %2 : index
    %4 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%4] () : memref<f64>
    %alloca = memref.alloca() : memref<f64>
    memref.store %cst, %alloca[] : memref<f64>
    %5 = gpu.memcpy async [%asyncToken] %memref, %alloca : memref<f64>, memref<f64>
    %c1_0 = arith.constant 1 : index
    %6 = affine.apply #map(%3)[%c0, %c256]
    %c256_1 = arith.constant 256 : index
    %7 = affine.apply #map(%c256_1)[%c0, %c1]
    %8 = gpu.launch_func async [%5] @legateMLIRKernel41_kernel::@legateMLIRKernel41_kernel blocks in (%6, %c1_0, %c1_0) threads in (%7, %c1_0, %c1_0) args(%3 : index, %2 : index, %arg0 : memref<?x?xf64>, %memref : memref<f64>)
    %alloca_2 = memref.alloca() : memref<f64>
    %9 = gpu.memcpy async [%8] %alloca_2, %memref : memref<f64>, memref<f64>
    %10 = memref.load %alloca_2[] : memref<f64>
    %11 = arith.minf %0, %10 : f64
    memref.store %11, %arg1[] : memref<f64>
    return
  }
  gpu.module @legateMLIRKernel41_kernel {
    gpu.func @legateMLIRKernel41_kernel(%arg0: index, %arg1: index, %arg2: memref<?x?xf64>, %arg3: memref<f64>) kernel {
      %0 = gpu.block_id  x
      %1 = gpu.block_id  y
      %2 = gpu.block_id  z
      %3 = gpu.thread_id  x
      %4 = gpu.thread_id  y
      %5 = gpu.thread_id  z
      %6 = gpu.grid_dim  x
      %7 = gpu.grid_dim  y
      %8 = gpu.grid_dim  z
      %9 = gpu.block_dim  x
      %10 = gpu.block_dim  y
      %11 = gpu.block_dim  z
      cf.br ^bb1
    ^bb1:  // pred: ^bb0
      %c256 = arith.constant 256 : index
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %cst = arith.constant 0x7FF0000000000000 : f64
      %12 = affine.apply #map1(%0)[%c256, %c0]
      %13 = affine.min #map2(%c256, %arg0, %12)
      %14 = affine.apply #map1(%3)[%c1, %c0]
      %15 = arith.cmpi slt, %14, %13 : index
      %16 = scf.if %15 -> (f64) {
        %20 = arith.addi %14, %12 : index
        %21 = arith.remsi %20, %arg1 : index
        %22 = arith.divsi %20, %arg1 : index
        %23 = memref.load %arg2[%22, %21] : memref<?x?xf64>
        scf.yield %23 : f64
      } else {
        scf.yield %cst : f64
      }
      %17 = gpu.all_reduce  min %16 uniform {
      } : (f64) -> f64
      %18 = gpu.thread_id  x
      %c0_0 = arith.constant 0 : index
      %19 = arith.cmpi eq, %18, %c0_0 : index
      scf.if %19 {
        %20 = memref.atomic_rmw minf %17, %arg3[] : (f64, memref<f64>) -> f64
      }
      gpu.return
    }
  }
}

And my conversion pipeline looks like this, which is mostly adapted from https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp.

    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertSCFToCFPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createLowerGpuOpsToNVVMOpsPass());

    pm.addPass(mlir::createConvertSCFToCFPass());
    pm.addPass(mlir::memref::createExpandStridedMetadataPass());
    pm.addPass(mlir::createLowerAffinePass());
    pm.addPass(mlir::createConvertIndexToLLVMPass());
    pm.addPass(mlir::createFinalizeMemRefToLLVMConversionPass());
    pm.addPass(mlir::arith::createArithExpandOpsPass());
    pm.addPass(mlir::createConvertComplexToLLVMPass());
    pm.addPass(mlir::createConvertMathToLLVMPass());
    pm.addPass(mlir::createConvertMathToLibmPass());
    pm.addPass(mlir::createConvertFuncToLLVMPass());

    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertNVGPUToNVVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::NVVM::createOptimizeForTargetPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createGpuSerializeToCubinPass("nvptx64-nvidia-cuda", "sm_60", "+ptx60"));
    pm.addPass(mlir::createGpuToLLVMConversionPass());

    pm.addPass(mlir::createReconcileUnrealizedCastsPass());

However, running this pipeline leaves me with a variety of unrealized_conversion_cast ops around memref and index operations, and I don’t know how to debug them further. This pass pipeline is a reordering of what I originally had, which also didn’t work. I’m not sure which of these passes are necessary and which aren’t, which of them need to be ordered relative to each other and which don’t, etc. Any help would be appreciated!
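For what it’s worth, here is roughly how I’ve been inspecting the IR between passes to see where the casts first appear. This is only a minimal sketch around the pipeline above (the runPipelineWithDumps wrapper is just for illustration); it relies on the standard PassManager IR-printing hook:

    #include "mlir/IR/BuiltinOps.h"
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Support/LogicalResult.h"

    // Illustrative wrapper around the pipeline above: it prints the IR before
    // and after every pass, which makes it easy to spot the pass that first
    // introduces the unrealized_conversion_cast ops.
    mlir::LogicalResult runPipelineWithDumps(mlir::ModuleOp module) {
      mlir::PassManager pm(module.getContext());
      // ... add the conversion passes listed above ...

      // Module-scope IR printing requires single-threaded execution.
      module.getContext()->disableMultithreading();
      pm.enableIRPrinting();
      return pm.run(module);
    }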

As a meta-point, the conversion infrastructure has so far been the most opaque part of MLIR for me to understand and use productively. It’s unclear to me how to decide on the magic order in which these conversion passes need to run, as evidenced by a similar question here (Help lowering affine loop to OpenMP).

I’ve run into the same problem. I’m trying to lower this code:

%26 = gpu.launch async [%23] blocks(%arg1, %arg2, %arg3) in (%arg7 = %25, %arg8 = %c1_37, %arg9 = %c1_37) threads(%arg4, %arg5, %arg6) in (%arg10 = %24, %arg11 = %c1_37, %arg12 = %c1_37) {
  %29 = gpu.thread_id  x
  %30 = gpu.block_id  x
  %31 = arith.muli %30, %c8 : index
  %32 = arith.divsi %29, %c4 : index
  %33 = arith.remsi %29, %c4 : index
  %34 = arith.addi %31, %32 : index
  %35 = arith.cmpi slt, %34, %22 : index
  scf.if %35 {
    %36 = memref.load %memref_31[%34] : memref<?xi32>
    %37 = arith.index_cast %36 : i32 to index
    %38 = memref.load %memref[%37] : memref<?xindex>
    %39 = arith.addi %37, %c1_37 : index
    %40 = memref.load %memref[%39] : memref<?xindex>
    %41 = arith.addi %38, %33 : index
    scf.for %arg13 = %41 to %40 step %c4 {
      %42 = memref.load %memref_25[%arg13] : memref<?xi32>
      %43 = memref.load %memref_27[%arg13] : memref<?xf32>
      %44 = arith.index_cast %42 : i32 to index
      %45 = memref.atomic_rmw addf %43, %memref_29[%44] : (f32, memref<?xf32>) -> f32
    }
  }
  gpu.terminator
}

and I get this error:

spmspv.mlir:98:13: error: cannot be converted to LLVM IR: missing `LLVMTranslationDialectInterface` registration for dialect for op: builtin.unrealized_conversion_cast
      %26 = gpu.launch async [%23] blocks(%arg1, %arg2, %arg3) in (%arg7 = %25, %arg8 = %c1_37, %arg9 = %c1_37) threads(%arg4, %arg5, %arg6) in (%arg10 = %24, %arg11 = %c1_37, %arg12 = %c1_37) {
            ^
spmspv.mlir:98:13: note: see current operation: %0 = "builtin.unrealized_conversion_cast"(%arg1) : (i64) -> index

But when I delete the memref.atomic_rmw operation, the code lowers to LLVM, and when I use memref.atomic_rmw on the CPU side it also lowers to LLVM. So maybe the GPU path doesn’t support memref.atomic_rmw? But I can’t find an atomic instruction in the gpu dialect or in the nvvm/nvgpu dialects. Or are we missing some translation pass?

I made a mistake: atomic_rmw doesn’t seem to be the reason for the (i64) -> index cast. Maybe when I deleted the atomic_rmw, the GPU kernel no longer had any effect and was optimized away by the compiler. For my simpler case, I found that changing the order to the following helps (I’m invoking these passes from a makefile):

-convert-linalg-to-loops -convert-vector-to-scf  -lower-affine -convert-scf-to-cf -convert-scf-to-openmp \
-convert-gpu-to-nvvm --gpu-to-cubin \
-convert-linalg-to-llvm -convert-openmp-to-llvm -convert-vector-to-llvm -convert-index-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-cf-to-llvm -convert-func-to-llvm --gpu-to-llvm\

Maybe you can move the *-to-llvm passes to after the convert passes: put -convert-gpu-to-nvvm --gpu-to-cubin at the end of the convert passes and --gpu-to-llvm at the end of the *-to-llvm passes.

@lyc200150, your pass pipeline does not work for me; I get a similar error to before. Unrelated, I’m not sure why your pipeline is trying to lower scf loops to both openmp and gpu…

@nicolasvasilache recently added a reference pipeline here, in case it can help.

I tried extracting Nicolas’s pipeline and it still results in unrealized conversion casts on this program, which is very similar to the original one before the kernel outlining pass has been applied:

#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
#map2 = affine_map<(d0, d1, d2) -> (256, d1 - d2)>
module {
  func.func @legateMLIRKernel39(%arg0: memref<?x?xf64>, %arg1: memref<f64>, %arg2: memref<2xindex>, %arg3: memref<0xindex>) attributes {llvm.emit_c_interface} {
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0x7FF0000000000000 : f64
    %0 = memref.load %arg1[] : memref<f64>
    %1 = memref.load %arg2[%c0] : memref<2xindex>
    %2 = memref.load %arg2[%c1] : memref<2xindex>
    %3 = arith.muli %1, %2 : index
    %4 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%4] () : memref<f64>
    %alloca = memref.alloca() : memref<f64>
    memref.store %cst, %alloca[] : memref<f64>
    %5 = gpu.memcpy async [%asyncToken] %memref, %alloca : memref<f64>, memref<f64>
    %c1_0 = arith.constant 1 : index
    %6 = affine.apply #map(%3)[%c0, %c256]
    %c256_1 = arith.constant 256 : index
    %7 = affine.apply #map(%c256_1)[%c0, %c1]
    %8 = gpu.launch async [%5] blocks(%arg4, %arg5, %arg6) in (%arg10 = %6, %arg11 = %c1_0, %arg12 = %c1_0) threads(%arg7, %arg8, %arg9) in (%arg13 = %7, %arg14 = %c1_0, %arg15 = %c1_0) {
      %c256_3 = arith.constant 256 : index
      %c0_4 = arith.constant 0 : index
      %c1_5 = arith.constant 1 : index
      %cst_6 = arith.constant 0x7FF0000000000000 : f64
      %12 = affine.apply #map1(%arg4)[%c256_3, %c0_4]
      %13 = affine.min #map2(%c256_3, %3, %12)
      %14 = affine.apply #map1(%arg7)[%c1_5, %c0_4]
      %15 = arith.cmpi slt, %14, %13 : index
      %16 = scf.if %15 -> (f64) {
        %20 = arith.addi %14, %12 : index
        %21 = arith.remsi %20, %2 : index
        %22 = arith.divsi %20, %2 : index
        %23 = memref.load %arg0[%22, %21] : memref<?x?xf64>
        scf.yield %23 : f64
      } else {
        scf.yield %cst_6 : f64
      }
      %17 = gpu.all_reduce  min %16 uniform {
      } : (f64) -> f64
      %18 = gpu.thread_id  x
      %c0_7 = arith.constant 0 : index
      %19 = arith.cmpi eq, %18, %c0_7 : index
      scf.if %19 {
        %20 = memref.atomic_rmw minf %17, %memref[] : (f64, memref<f64>) -> f64
      }
      gpu.terminator
    } {SCFToGPU_visited}
    %alloca_2 = memref.alloca() : memref<f64>
    %9 = gpu.memcpy async [%8] %alloca_2, %memref : memref<f64>, memref<f64>
    %10 = memref.load %alloca_2[] : memref<f64>
    %11 = arith.minf %0, %10 : f64
    memref.store %11, %arg1[] : memref<f64>
    return
  }
}

My code is taken from the diff but doesn’t bother with the index bit-width handling, as I’m fine with 64-bit indices for now.

    // Passes for the host side...
    pm.addPass(mlir::createGpuKernelOutliningPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createConvertSCFToCFPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createConvertMathToLLVMPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createConvertComplexToLLVMPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::memref::createExpandStridedMetadataPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createLowerAffinePass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createFinalizeMemRefToLLVMConversionPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createConvertFuncToLLVMPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createConvertIndexToLLVMPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createArithToLLVMConversionPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createCanonicalizerPass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::createCSEPass());

    // GPU module specific things...
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertSCFToCFPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertMathToLLVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertComplexToLLVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::memref::createExpandStridedMetadataPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createLowerAffinePass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createFinalizeMemRefToLLVMConversionPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertFuncToLLVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertIndexToLLVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createLowerGpuOpsToNVVMOpsPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertNVGPUToNVVMPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertSCFToCFPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::NVVM::createOptimizeForTargetPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createGpuToLLVMConversionPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createCanonicalizerPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createCSEPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createReconcileUnrealizedCastsPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createGpuSerializeToCubinPass("nvptx64-nvidia-cuda", "sm_60", "+ptx60"));

    // Host post-GPU module specific things...
    pm.addPass(mlir::createConvertIndexToLLVMPass());
    pm.addPass(mlir::createGpuToLLVMConversionPass());
    pm.addPass(mlir::createConvertFuncToLLVMPass());
    pm.addPass(mlir::createCanonicalizerPass());
    pm.addPass(mlir::createCSEPass());
    pm.addPass(mlir::createReconcileUnrealizedCastsPass());

Unrelated, despite that diff landing, I can’t find the source files in the LLVM repo? https://github.com/llvm/llvm-project/tree/main/mlir/test/lib/Dialect.

I also think that adding ConvertMathToLLVMPass before the various NVVM passes is a mistake, since it prevents math operations that have NVVM implementations from being lowered correctly.
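Concretely, for the gpu.module nest I mean something like the following relative order, reusing the same pass-creation calls that already appear in the pipeline above (just a sketch of the ordering, not a complete pipeline):

    // Run the GPU-to-NVVM lowering before the generic math-to-LLVM lowering,
    // so that math ops with NVVM-specific lowerings are handled there first;
    // only afterwards lower whatever math ops remain to plain LLVM.
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createLowerGpuOpsToNVVMOpsPass());
    pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertMathToLLVMPass());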

In general, you can figure out what happened with git log -- <path>. Another way is to look at some function of interest and git grep the codebase to find out whether it got refactored somewhere else.

In this case it moved to: https://github.com/llvm/llvm-project/tree/main/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp

I tried the test-lower-to-nvvm pass as-is today as well, and it also didn’t work for the above example.

Can you clarify what you mean by “it does not work”?

Right now for me it looks like this:

$ bin/mlir-opt --test-lower-to-nvvm /tmp/gpu.mlir  | bin/mlir-translate --mlir-to-llvmir
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"

%0 = type { i64, i64, ptr, ptr, i64, i64, i64, i64, i64, ptr, ptr, i64 }

@legateMLIRKernel39_kernel_legateMLIRKernel39_kernel_kernel_name = internal constant [26 x i8] c"legateMLIRKernel39_kernel\00"
@legateMLIRKernel39_kernel_gpubin_cst = internal constant [21280 x i8] c"\7FELF\02\01\013\07\00\00\00\00\00\00\00\02\00\BE\00y\00\00\00\00\00\00\00\00\00\00\00@R\00\00\00\00\00\00\80M\00\00\00\00\00\00Y\05P\00@\008\00\04\00@\00\13\00\01\00\00.shstrtab\00.strtab\00.symtab\00.symtab_shndx\00.nv.uft.entry\00.nv.i ..."
declare ptr @malloc(i64)

declare void @free(ptr)

define void @legateMLIRKernel39(ptr %0, ptr %1, i64 %2, i64 %3, i64 %4, i64 %5, i64 %6, ptr %7, ptr %8, i64 %9, ptr %10, ptr %11, i64 %12, i64 %13, i64 %14, ptr %15, ptr %16, i64 %17, i64 %18, i64 %19) {
  %21 = load double, ptr %8, align 8
  %22 = load i64, ptr %11, align 4
  %23 = getelementptr i64, ptr %11, i32 1
  %24 = load i64, ptr %23, align 4
  %25 = mul i64 %22, %24
  %26 = call ptr @mgpuStreamCreate()
  %27 = call ptr @mgpuMemAlloc(i64 ptrtoint (ptr getelementptr (double, ptr null, i32 1) to i64), ptr %26)
  %28 = alloca double, i64 1, align 8
  store double 0x7FF0000000000000, ptr %28, align 8
...
  ret void
}

define void @_mlir_ciface_legateMLIRKernel39(ptr %0, ptr %1, ptr %2, ptr %3) {
  %5 = load { ptr, ptr, i64, [2 x i64], [2 x i64] }, ptr %0, align 8
  %6 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %5, 0
  ...
  call void @legateMLIRKernel39(ptr %6, ptr %7, i64 %8, i64 %9, i64 %10, i64 %11, i64 %12, ptr %14, ptr %15, i64 %16, ptr %18, ptr %19, i64 %20, i64 %21, i64 %22, ptr %24, ptr %25, i64 %26, i64 %27, i64 %28)
  ret void
}

declare ptr @mgpuStreamCreate()

declare ptr @mgpuMemAlloc(i64, ptr)

declare void @mgpuMemcpy(ptr, ptr, i64, ptr)

declare ptr @mgpuModuleLoad(ptr)

declare ptr @mgpuModuleGetFunction(ptr, ptr)

declare void @mgpuLaunchKernel(ptr, i64, i64, i64, i64, i64, i64, i32, ptr, ptr, ptr)

declare void @mgpuModuleUnload(ptr)

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare double @llvm.minnum.f64(double, double) #0

attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

!llvm.module.flags = !{!0}

!0 = !{i32 2, !"Debug Info Version", i32 3}

Hmm, I get:

(legate) rohany@sean-dgx2:/local/home/rohany/llvm-project/build$ ./bin/mlir-opt test.mlir --test-lower-to-nvvm
<unknown>:0: error: failed to legalize operation 'builtin.unrealized_conversion_cast' that was explicitly marked illegal
<unknown>:0: note: see current operation: %18 = "builtin.unrealized_conversion_cast"(%17) : (!llvm.struct<(ptr, ptr, i64)>) -> memref<f64>

Perhaps my repository is out of date with upstream changes. I will try to merge against HEAD and see what happens.

My fork of the repository was out of date. I’m not sure exactly which commits fixed things, but the reference pipeline works for this example now – thanks!