Problem with ConvertElementwiseToLinalg followed by LinalgFusionOfTensorOps

On running mlir-opt with -convert-elementwise-to-linalg pass
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = addf %arg0, %arg1 : tensor<4x3x2x2xf32>
%1 = mulf %0, %arg1 : tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
the Output is
IR Dump After ConvertElementwiseToLinalg
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}

Further running the pass with LinalgFusionOfTensorOps gives output as
IR Dump After LinalgFusionOfTensorOps
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
%3 = mulf %2, %arg3 : f32
linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
}

  1. Basically, ConvertElementwiseToLinalg created two linalg.generic regions — one containing addf and one containing mulf — and LinalgFusionOfTensorOps fuses them, creating a single region with both addf and mulf together. So the expected output was a single region.
  2. Both linalg regions created by ConvertElementwiseToLinalg use the %arg0 input as the output.

#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) → tensor<4x3x2x2xf32> {

%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
  %2 = addf %arg2, %arg3 : f32
  %3 = mulf %2, %arg3 : f32
  linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>

}
}

On commenting out the logic below, the pass seems to add %0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
as the output for the outputs of both linalg regions

llvm-project/ElementwiseToLinalg.cpp at main · llvm/llvm-project · GitHub till
llvm-project/ElementwiseToLinalg.cpp at main · llvm/llvm-project · GitHub
#if 0
bool found = false;
for (Value v : operands) {
if (v.getType() == t) {
found = true;
res.push_back(v);
break;
}
}
if (found)
continue;
#endif

Output becomes
IR Dump After ConvertElementwiseToLinalg
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) → tensor<4x3x2x2xf32> {
%0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%4 = addf %arg2, %arg3 : f32
linalg.yield %4 : f32
} -> tensor<4x3x2x2xf32>
%2 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%2 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%4 = mulf %arg2, %arg3 : f32
linalg.yield %4 : f32
} -> tensor<4x3x2x2xf32>
return %3 : tensor<4x3x2x2xf32>
}
return %3 : tensor<4x3x2x2xf32>
}

Dump After LinalgFusionOfTensorOps
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
%3 = mulf %2, %arg3 : f32
linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
}

This output from ConvertElementwiseToLinalg followed by LinalgFusionOfTensorOps looks correct.

I dont know the specifics of the ConvertElementwiseToLinalg lowering, but if you run -canonicalize after the lowering that will break the artificial dependence created by the use of the same value to the ins and outs. Instead it will create an linalg.init_tensor like what you have in the second input. Then running fusion will give you what you want. In both cases fusion is doing the right thing. Its just that in one case, the operation can’t be removed since there is still a use of the unfused op (for its shape, and not its value)

Nit: Please use markdown formatting, especially for code-snippets. Its really hard to read the post without it.

Hi Mahesh, I did run the canonicalizer after convert-elementwise-to-linalg, but it still didn't seem to add the linalg.init_tensor, and the result seems to be the same.
"after the lowering that will break the artificial dependence created by the use of the same value to the ins and outs. Instead it will create an linalg.init_tensor"

BTW, which canonicalization pattern gets invoked that creates the needed linalg.init_tensor?

I tried to look into the canonicalization patterns registered by
llvm-project/LinalgOps.cpp at main · llvm/llvm-project · GitHub but couldn't find one.
ReplaceStaticShapeDims seems to fold the constant, but again it needs to find a linalg.init_tensor pattern first.
/// %c5 = constant 5: index
/// %0 = linalg.init_tensor [%arg0, %c5] : tensor<?x?xf32>
/// to
/// %0 = linalg.init_tensor [%arg0, 5] : tensor<?x5xf32>

/data/dkd/iree_release/iree-build/iree/tools/iree-opt -print-ir-before-all --print-ir-after-all -convert-elementwise-to-linalg --canonicalize convert-elementwise-to-linalg_4d_add_mul_only.mlir

 IR Dump After Canonicalizer  
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module  {
  func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
    %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = addf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = mulf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    return %1 : tensor<4x3x2x2xf32>
}

Sorry, my memory fails me. It was intended as a canonicalization pattern, but it was later moved to the fusion pass here. It is added to the list of patterns applied in the pass here (invoked from within the pass here). So this should work, and the canonicalizer should not be needed — the fusion pass should handle it by itself. I tried this locally

$ cat fusion.mlir 
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module  {
  func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
    %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = addf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = mulf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    return %1 : tensor<4x3x2x2xf32>
  }
}

$ mlir-opt -linalg-fusion-for-tensor-ops fusion.mlir 
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module  {
  func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
    %0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = addf %arg2, %arg3 : f32
      %3 = mulf %2, %arg3 : f32
      linalg.yield %3 : f32
    } -> tensor<4x3x2x2xf32>
    return %1 : tensor<4x3x2x2xf32>
  }
}

Are you running this with ToT?

Hi Mahesh,
My bad — it seems I was on a slightly older version of both IREE and MLIR, which did not have the updated changes for the Linalg RemoveOutsDependency pattern.

I updated to ToT and it seems to work correctly.

It seems ConvertElementwiseToLinalg is intended to wrap a linalg region around elementwise operators.


(base) ubuntu@HYD1PNF01:/data/dkd/mlir_release/llvm-project/mlir/lib/Dialect/Linalg/Transforms$ git log
commit f653313d4aec6f92b224ef996a8ac236dbb48baf (HEAD -> main, origin/main, origin/HEAD)
Author: Chia-hung Duan <chiahungduan@google.com>
Date: Wed May 12 11:21:25 2021 +0800 [mlir][AsmPrinter] Remove recursion while SSA naming

(base) ubuntu@HYD1PNF01:/data/dkd/iree_release/iree/third_party/llvm-project/mlir/lib/Dialect/Linalg/Transforms$ git log
commit b2780cd744eaad6f5c7f39165054cf7000a1ff07 (HEAD)
Author: Martin Probst <martin@probst.io>
Date: Thu Apr 29 11:35:27 2021 +0200 clang-format: [JS] handle “off” in imports

/data/dkd/mlir_release/llvm-project/build/bin/mlir-opt -print-ir-before-all --print-ir-after-all -convert-elementwise-to-linalg --linalg-fusion-for-tensor-ops convert-elementwise-to-linalg_4d_add_mul_only.mlir
// *** IR Dump Before ConvertElementwiseToLinalg ***
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
  %0 = addf %arg0, %arg1 : tensor<4x3x2x2xf32>
  %1 = mulf %0, %arg1 : tensor<4x3x2x2xf32>
  return %1 : tensor<4x3x2x2xf32>
}

// *** IR Dump After ConvertElementwiseToLinalg ***
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
  ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
    %2 = addf %arg2, %arg3 : f32
    linalg.yield %2 : f32
  } -> tensor<4x3x2x2xf32>
  %1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
  ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
    %2 = mulf %arg2, %arg3 : f32
    linalg.yield %2 : f32
  } -> tensor<4x3x2x2xf32>
  return %1 : tensor<4x3x2x2xf32>
}

// *** IR Dump Before LinalgFusionOfTensorOps ***
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module  {
  func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
    %0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = addf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = mulf %arg2, %arg3 : f32
      linalg.yield %2 : f32
    } -> tensor<4x3x2x2xf32>
    return %1 : tensor<4x3x2x2xf32>
  }
}


// *** IR Dump After LinalgFusionOfTensorOps ***
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module  {
  func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
    %0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
    ^bb0(%arg2: f32, %arg3: f32, %arg4: f32):  // no predecessors
      %2 = addf %arg2, %arg3 : f32
      %3 = mulf %2, %arg3 : f32
      linalg.yield %3 : f32
    } -> tensor<4x3x2x2xf32>
    return %1 : tensor<4x3x2x2xf32>
  }
}