I’m looking for help getting my conversion pipeline for GPU + SCF to LLVM working. It has handled my programs fine so far, but I’ve now run into some new programs where it fails, and I’d like to figure out what’s going wrong.
I’m trying to lower the following snippet of code. There are various idiosyncrasies in it, like loading dimension values from memrefs rather than using memref.dim; please ignore these, as there is a good reason the code is written this way.
#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
#map2 = affine_map<(d0, d1, d2) -> (256, d1 - d2)>
module attributes {gpu.container_module} {
  func.func @legateMLIRKernel41(%arg0: memref<?x?xf64>, %arg1: memref<f64>, %arg2: memref<2xindex>, %arg3: memref<0xindex>) attributes {llvm.emit_c_interface} {
    %c256 = arith.constant 256 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0x7FF0000000000000 : f64
    %0 = memref.load %arg1[] : memref<f64>
    %1 = memref.load %arg2[%c0] : memref<2xindex>
    %2 = memref.load %arg2[%c1] : memref<2xindex>
    %3 = arith.muli %1, %2 : index
    %4 = gpu.wait async
    %memref, %asyncToken = gpu.alloc async [%4] () : memref<f64>
    %alloca = memref.alloca() : memref<f64>
    memref.store %cst, %alloca[] : memref<f64>
    %5 = gpu.memcpy async [%asyncToken] %memref, %alloca : memref<f64>, memref<f64>
    %c1_0 = arith.constant 1 : index
    %6 = affine.apply #map(%3)[%c0, %c256]
    %c256_1 = arith.constant 256 : index
    %7 = affine.apply #map(%c256_1)[%c0, %c1]
    %8 = gpu.launch_func async [%5] @legateMLIRKernel41_kernel::@legateMLIRKernel41_kernel blocks in (%6, %c1_0, %c1_0) threads in (%7, %c1_0, %c1_0) args(%3 : index, %2 : index, %arg0 : memref<?x?xf64>, %memref : memref<f64>)
    %alloca_2 = memref.alloca() : memref<f64>
    %9 = gpu.memcpy async [%8] %alloca_2, %memref : memref<f64>, memref<f64>
    %10 = memref.load %alloca_2[] : memref<f64>
    %11 = arith.minf %0, %10 : f64
    memref.store %11, %arg1[] : memref<f64>
    return
  }
  gpu.module @legateMLIRKernel41_kernel {
    gpu.func @legateMLIRKernel41_kernel(%arg0: index, %arg1: index, %arg2: memref<?x?xf64>, %arg3: memref<f64>) kernel {
      %0 = gpu.block_id x
      %1 = gpu.block_id y
      %2 = gpu.block_id z
      %3 = gpu.thread_id x
      %4 = gpu.thread_id y
      %5 = gpu.thread_id z
      %6 = gpu.grid_dim x
      %7 = gpu.grid_dim y
      %8 = gpu.grid_dim z
      %9 = gpu.block_dim x
      %10 = gpu.block_dim y
      %11 = gpu.block_dim z
      cf.br ^bb1
    ^bb1: // pred: ^bb0
      %c256 = arith.constant 256 : index
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %cst = arith.constant 0x7FF0000000000000 : f64
      %12 = affine.apply #map1(%0)[%c256, %c0]
      %13 = affine.min #map2(%c256, %arg0, %12)
      %14 = affine.apply #map1(%3)[%c1, %c0]
      %15 = arith.cmpi slt, %14, %13 : index
      %16 = scf.if %15 -> (f64) {
        %20 = arith.addi %14, %12 : index
        %21 = arith.remsi %20, %arg1 : index
        %22 = arith.divsi %20, %arg1 : index
        %23 = memref.load %arg2[%22, %21] : memref<?x?xf64>
        scf.yield %23 : f64
      } else {
        scf.yield %cst : f64
      }
      %17 = gpu.all_reduce min %16 uniform {
      } : (f64) -> f64
      %18 = gpu.thread_id x
      %c0_0 = arith.constant 0 : index
      %19 = arith.cmpi eq, %18, %c0_0 : index
      scf.if %19 {
        %20 = memref.atomic_rmw minf %17, %arg3[] : (f64, memref<f64>) -> f64
      }
      gpu.return
    }
  }
}
My conversion pipeline, mostly adapted from https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp, looks like this:
pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertSCFToCFPass());
pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createLowerGpuOpsToNVVMOpsPass());
pm.addPass(mlir::createConvertSCFToCFPass());
pm.addPass(mlir::memref::createExpandStridedMetadataPass());
pm.addPass(mlir::createLowerAffinePass());
pm.addPass(mlir::createConvertIndexToLLVMPass());
pm.addPass(mlir::createFinalizeMemRefToLLVMConversionPass());
pm.addPass(mlir::arith::createArithExpandOpsPass());
pm.addPass(mlir::createConvertComplexToLLVMPass());
pm.addPass(mlir::createConvertMathToLLVMPass());
pm.addPass(mlir::createConvertMathToLibmPass());
pm.addPass(mlir::createConvertFuncToLLVMPass());
pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createConvertNVGPUToNVVMPass());
pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::NVVM::createOptimizeForTargetPass());
pm.addNestedPass<mlir::gpu::GPUModuleOp>(mlir::createGpuSerializeToCubinPass("nvptx64-nvidia-cuda", "sm_60", "+ptx60"));
pm.addPass(mlir::createGpuToLLVMConversionPass());
pm.addPass(mlir::createReconcileUnrealizedCastsPass());
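For context, the pipeline above is built on an ordinary mlir::PassManager and run programmatically. Below is a minimal sketch of how such a pipeline gets driven; the file name, dialect registration, and error handling are illustrative assumptions, not my exact driver code.

#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/MLIRContext.h"
#include "mlir/InitAllDialects.h"
#include "mlir/Parser/Parser.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/Support/raw_ostream.h"

int runPipeline() {
  // Register all upstream dialects so the gpu/scf/affine/memref/arith/cf IR
  // above parses; a real setup could register a narrower set.
  mlir::DialectRegistry registry;
  mlir::registerAllDialects(registry);
  // Note: the gpu-serialize-to-cubin step additionally needs the LLVM/NVVM
  // dialect translations and the NVPTX backend registered; omitted here.
  mlir::MLIRContext context(registry);

  // "kernel.mlir" is a placeholder for wherever the module above comes from.
  mlir::OwningOpRef<mlir::ModuleOp> module =
      mlir::parseSourceFile<mlir::ModuleOp>("kernel.mlir", &context);
  if (!module)
    return 1;

  mlir::PassManager pm(&context);
  // ... the addPass / addNestedPass calls listed above go here ...

  if (mlir::failed(pm.run(*module))) {
    llvm::errs() << "pipeline failed\n";
    return 1;
  }
  module->print(llvm::outs());
  return 0;
}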
However, running this pipeline leaves me with a variety of unrealized_conversion_cast operations around memref and index operations that I don’t know how to debug further. This pass pipeline is a reordering of what I originally had, which also didn’t work. I’m not sure which of these passes are necessary and which are not, which of them have ordering constraints between them and which don’t, and so on. Any help would be appreciated!
As a meta-point, the conversion infrastructure has so far been the most opaque part of MLIR for me to understand and use productively. It’s unclear to me how to determine the magic order in which these conversion passes need to be run, as evidenced by a similar question here (Help lowering affine loop to OpenMP).