Help lowering affine loop to OpenMP

I’m having trouble lowering an affine loop nest (one that looks slightly different from the usual case) to OpenMP because of some leftover unrealized conversion casts. I didn’t see any integration tests or compiler pipelines in the repository that include an OpenMP lowering phase (though I could definitely be looking in the wrong places). The code I’m trying to lower is this:

#map = affine_map<(d0) -> (d0)>
module {
  func.func @kernel(%arg0: memref<?xf64>, %arg1: memref<f64>, %arg2: memref<?xf64>, %arg3: memref<2xindex>, %arg4: memref<2xindex>, %arg5: memref<2xindex>) attributes {llvm.emit_c_interface} {
    %c0 = arith.constant 0 : index
    %0 = affine.load %arg5[%c0] : memref<2xindex>
    %c1 = arith.constant 1 : index
    %1 = affine.load %arg5[%c1] : memref<2xindex>
    %2 = affine.load %arg1[] : memref<f64>
    affine.parallel (%arg6) = (%c0) to (%0) {
      affine.for %arg7 = #map(%c0) to #map(%1) {
        %3 = affine.load %arg2[%arg6] : memref<?xf64>
        %4 = arith.divf %3, %2 : f64
        affine.store %4, %arg2[%arg6] : memref<?xf64>
      }
    }
    return
  }
}

And my conversion code is:

    pm.addNestedPass<mlir::func::FuncOp>(mlir::createLowerAffinePass());
    pm.addNestedPass<mlir::func::FuncOp>(mlir::arith::createArithExpandOpsPass());
    pm.addPass(mlir::createConvertSCFToOpenMPPass());
    pm.addPass(mlir::createConvertOpenMPToLLVMPass());
    pm.addPass(mlir::createConvertSCFToCFPass());
    pm.addPass(mlir::createConvertControlFlowToLLVMPass());
    pm.addPass(mlir::memref::createExpandStridedMetadataPass());
    pm.addPass(mlir::createFinalizeMemRefToLLVMConversionPass());
    pm.addPass(mlir::createConvertMathToLLVMPass());
    pm.addPass(mlir::createConvertMathToLibmPass());
    pm.addPass(mlir::createConvertFuncToLLVMPass());
    pm.addPass(mlir::createConvertIndexToLLVMPass());
    pm.addPass(mlir::createReconcileUnrealizedCastsPass());

This pipeline has worked for loop nests that are fully parallel, but this one is a parallel loop with an inner sequential loop. I get the following error:

loc("binary_op"): error: failed to legalize operation 'builtin.unrealized_conversion_cast' that was explicitly marked illegal
module attributes {llvm.data_layout = ""} {
  llvm.func @kernel(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: !llvm.ptr, %arg6: !llvm.ptr, %arg7: i64, %arg8: !llvm.ptr, %arg9: !llvm.ptr, %arg10: i64, %arg11: i64, %arg12: i64, %arg13: !llvm.ptr, %arg14: !llvm.ptr, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr, %arg19: !llvm.ptr, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: !llvm.ptr, %arg24: !llvm.ptr, %arg25: i64, %arg26: i64, %arg27: i64) attributes {llvm.emit_c_interface} {
    %0 = llvm.mlir.constant(0 : index) : i64
    %1 = llvm.mlir.constant(1 : index) : i64
    %2 = builtin.unrealized_conversion_cast %1 : i64 to index
    %3 = builtin.unrealized_conversion_cast %0 : i64 to index
    %4 = llvm.load %arg24 : !llvm.ptr -> i64
    %5 = llvm.getelementptr %arg24[1] : (!llvm.ptr) -> !llvm.ptr, i64
    %6 = llvm.load %5 : !llvm.ptr -> i64
    %7 = builtin.unrealized_conversion_cast %6 : i64 to index
    %8 = llvm.load %arg6 : !llvm.ptr -> f64
    omp.parallel   {
      omp.wsloop   for  (%arg28) : i64 = (%0) to (%4) step (%1) {
        %9 = llvm.intr.stacksave : !llvm.ptr
        llvm.br ^bb1
      ^bb1:  // pred: ^bb0
        cf.br ^bb2(%3 : index)
      ^bb2(%10: index):  // 2 preds: ^bb1, ^bb3
        %11 = builtin.unrealized_conversion_cast %10 : index to i64
        %12 = llvm.icmp "slt" %11, %6 : i64
        llvm.cond_br %12, ^bb3, ^bb4
      ^bb3:  // pred: ^bb2
        %13 = llvm.getelementptr %arg9[%arg28] : (!llvm.ptr, i64) -> !llvm.ptr, f64
        %14 = llvm.load %13 : !llvm.ptr -> f64
        %15 = llvm.fdiv %14, %8  : f64
        %16 = llvm.getelementptr %arg9[%arg28] : (!llvm.ptr, i64) -> !llvm.ptr, f64
        llvm.store %15, %16 : f64, !llvm.ptr
        %17 = llvm.add %11, %1  : i64
        %18 = builtin.unrealized_conversion_cast %17 : i64 to index
        cf.br ^bb2(%18 : index)
      ^bb4:  // pred: ^bb2
        llvm.intr.stackrestore %9 : !llvm.ptr
        llvm.br ^bb5
      ^bb5:  // pred: ^bb4
        omp.yield
      }
      omp.terminator
    }
    llvm.return
  }
...

Based on looking at the output from the pass manager, the conversion casts are introduced by the OpenMPToLLVMPass and the FuncToLLVMPass, but are not getting removed by the IndexToLLVMPass. What am I doing wrong?

The following sequence worked for me.

./bin/mlir-opt -lower-affine -convert-scf-to-openmp -convert-func-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-openmp-to-llvm -convert-index-to-llvm -reconcile-unrealized-casts af.mlir

Thank you, this worked! Was this documented somewhere?

I don’t think this is documented. Since this involves a few dialects, it is also not clear whether it can be documented.

The scf-to-openmp pass creates a memref.alloca_scope, and an alloca scope only works with a single block. So I decided to run the finalize-memref-to-llvm conversion first, and only then the convert-scf-to-cf pass. I believe the other passes can be run in any order after this.