Hi there,
Assuming I have a pass that generates parallel loops and attributes them with GPU mapping attributes:
#map0 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
#map2 = affine_map<(d0) -> (d0)>
module {
func @function(%arg0: memref<64x1024xf32>, %arg1: memref<64x1024xf32>, %arg2: memref<64x1024xf32>) {
%c1 = constant 1 : index
%c64 = constant 64 : index
%c1024 = constant 1024 : index
%c0 = constant 0 : index
scf.parallel (%arg3, %arg4) = (%c0, %c0) to (%c64, %c1024) step (%c1, %c1024) {
%0 = memref.subview %arg0[%arg3, %arg4] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
%1 = memref.subview %arg1[%arg3, %arg4] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
%2 = memref.subview %arg2[%arg3, %arg4] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : memref<1x1024xf32, #map0>, memref<1x1024xf32, #map0>) outs(%2 : memref<1x1024xf32, #map0>) {
^bb0(%arg5: f32, %arg6: f32, %arg7: f32): // no predecessors
%3 = addf %arg5, %arg6 : f32
linalg.yield %3 : f32
}
scf.yield
} {mapping = [{bound = #map2, map = #map2, processor = 2 : i64}, {bound = #map2, map = #map2, processor = 1 : i64}, {bound = #map2, map = #map2, processor = 0 : i64}]}
return
}
}
As can be observed the inner loop is only one iteration and can be removed, and that is what the canonicalizer of the parallel op does, however it doesn’t keep the mapping attributes that were previously added:
/// mlir-opt --canonicalize
#map0 = affine_map<(d0, d1)[s0] -> (d0 * 1024 + s0 + d1)>
#map1 = affine_map<(d0, d1) -> (d0, d1)>
module {
func @function(%arg0: memref<64x1024xf32>, %arg1: memref<64x1024xf32>, %arg2: memref<64x1024xf32>) {
%c0 = constant 0 : index
%c64 = constant 64 : index
%c1 = constant 1 : index
scf.parallel (%arg3) = (%c0) to (%c64) step (%c1) {
%0 = memref.subview %arg0[%arg3, 0] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
%1 = memref.subview %arg1[%arg3, 0] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
%2 = memref.subview %arg2[%arg3, 0] [1, 1024] [1, 1] : memref<64x1024xf32> to memref<1x1024xf32, #map0>
linalg.generic {indexing_maps = [#map1, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%0, %1 : memref<1x1024xf32, #map0>, memref<1x1024xf32, #map0>) outs(%2 : memref<1x1024xf32, #map0>) {
^bb0(%arg4: f32, %arg5: f32, %arg6: f32): // no predecessors
%3 = addf %arg4, %arg5 : f32
linalg.yield %3 : f32
}
scf.yield
}
return
}
}
Looking into it, looks like the rewriter that performs the canonicalization creates a new parallel op and replaces the parallel op with the new op and doesn’t set attributes for the newly created op. Is this behavior intentional?
Assuming I have no access to the higher level pass that generated the uncanonicalized loops with attributes, how can I canonicalize the loop while preserving its attributes?
Thank you in advance