Thanks!
I am investigating the feasibility of using the Affine dialect and its transformations in Flang. One aspect is the ability to generate debug and TBAA information for Fortran programs, which currently happens quite late in the Flang pass pipeline. To preserve the source-level information until then, Flang relies on certain FIR operations such as fir.declare and fir.dummy_scope. If I want to apply the Affine transformations “in the middle” of the Flang pass pipeline, I may end up with MLIR like this:
// RUN: fir-opt %s -allow-unregistered-dialect -affine-parallelize
// test1: the loop body mixes affine/memref operations with FIR operations and
// contains a fir.dummy_scope. According to the surrounding discussion, this is
// the variant that -affine-parallelize does NOT parallelize.
func.func @_QPtest1(%arg0 : memref<10xf32>) {
%cst = arith.constant 1.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
// %16 is the induction variable plus one; the final store subtracts one again.
%16 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
// A scalar temporary is allocated inside the loop body.
%alloca_0 = memref.alloca() : memref<f32>
// Convert memref -> !fir.ref so the FIR operations below can take it.
%17 = fir.convert %alloca_0 : (memref<f32>) -> !fir.ref<f32>
// The operation under discussion: the post states it has a MemWrite effect on
// FIR's DebuggingResource.
%18 = fir.dummy_scope : !fir.dscope
// Declaration tied to the dummy scope %18 and carrying the uniq_name attribute.
%20 = fir.declare %17 dummy_scope %18 arg 1 {uniq_name = "_QFtestFinnerEy"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
// Convert the declared reference back to a memref for the affine accesses.
%21 = fir.convert %20 : (!fir.ref<f32>) -> memref<f32>
// Write the constant to the temporary, read it back, and store it to %arg0.
affine.store %cst, %21[] : memref<f32>
%22 = affine.load %21[] : memref<f32>
affine.store %22, %arg0[%16 - 1] : memref<10xf32>
}
return
}
// test2: same loop as test1, but without fir.dummy_scope and without the
// dummy_scope/arg operands on fir.declare. According to the surrounding
// discussion, this is the variant that -affine-parallelize DOES parallelize.
func.func @_QPtest2(%arg0 : memref<10xf32>) {
%cst = arith.constant 1.000000e+00 : f32
affine.for %arg2 = 0 to 10 {
// %16 is the induction variable plus one; the final store subtracts one again.
%16 = affine.apply affine_map<(d0) -> (d0 + 1)>(%arg2)
// A scalar temporary is allocated inside the loop body.
%alloca_0 = memref.alloca() : memref<f32>
// Convert memref -> !fir.ref so fir.declare can take it.
%17 = fir.convert %alloca_0 : (memref<f32>) -> !fir.ref<f32>
// Declaration with only the uniq_name attribute (no dummy scope operand).
%20 = fir.declare %17 {uniq_name = "_QFtestFinnerEy"} : (!fir.ref<f32>) -> !fir.ref<f32>
// Convert the declared reference back to a memref for the affine accesses.
%21 = fir.convert %20 : (!fir.ref<f32>) -> memref<f32>
// Write the constant to the temporary, read it back, and store it to %arg0.
affine.store %cst, %21[] : memref<f32>
%22 = affine.load %21[] : memref<f32>
affine.store %22, %arg0[%16 - 1] : memref<10xf32>
}
return
}
In test1 I show MLIR that may plausibly arise from mixing FIR and Affine dialect operations - note the fir.dummy_scope inside the loop body. Such code may appear due to MLIR inlining, due to early materialization of OpenACC private variables in the Flang front end, or for other reasons.
In test2 I manually removed fir.dummy_scope (i.e. I lost some information due to this).
I tested both with my modified fir-opt tool (with the Affine passes registered and ViewLikeOpInterface attached to the fir.declare operation): -affine-parallelize can parallelize the loop in test2 but not in test1, because fir.dummy_scope has a MemWrite effect on FIR’s DebuggingResource. As I said before, DebuggingResource is used to guarantee fir.dummy_scope nesting (in the case of MLIR inlining), but it is just an artificial “metadata” resource and it should not restrict parallelization in any way.
Output MLIR with my modified fir-opt:
// Tool output (annotated): the (d0 + 1) map is printed as the alias #map.
#map = affine_map<(d0) -> (d0 + 1)>
module {
// test1 after the pass: the loop is still a sequential affine.for.
func.func @_QPtest1(%arg0: memref<10xf32>) {
%cst = arith.constant 1.000000e+00 : f32
affine.for %arg1 = 0 to 10 {
%0 = affine.apply #map(%arg1)
%alloca = memref.alloca() : memref<f32>
%1 = fir.convert %alloca : (memref<f32>) -> !fir.ref<f32>
// fir.dummy_scope is still present in the loop body.
%2 = fir.dummy_scope : !fir.dscope
%3 = fir.declare %1 dummy_scope %2 arg 1 {uniq_name = "_QFtestFinnerEy"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
%4 = fir.convert %3 : (!fir.ref<f32>) -> memref<f32>
affine.store %cst, %4[] : memref<f32>
%5 = affine.load %4[] : memref<f32>
affine.store %5, %arg0[%0 - 1] : memref<10xf32>
}
return
}
// test2 after the pass: the loop has been rewritten to affine.parallel.
func.func @_QPtest2(%arg0: memref<10xf32>) {
%cst = arith.constant 1.000000e+00 : f32
affine.parallel (%arg1) = (0) to (10) {
%0 = affine.apply #map(%arg1)
%alloca = memref.alloca() : memref<f32>
%1 = fir.convert %alloca : (memref<f32>) -> !fir.ref<f32>
%2 = fir.declare %1 {uniq_name = "_QFtestFinnerEy"} : (!fir.ref<f32>) -> !fir.ref<f32>
%3 = fir.convert %2 : (!fir.ref<f32>) -> memref<f32>
affine.store %cst, %3[] : memref<f32>
%4 = affine.load %3[] : memref<f32>
affine.store %4, %arg0[%0 - 1] : memref<10xf32>
}
return
}
}