On running mlir-opt with the -convert-elementwise-to-linalg pass on the following IR:
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = addf %arg0, %arg1 : tensor<4x3x2x2xf32>
%1 = mulf %0, %arg1 : tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
the output is:
IR Dump After ConvertElementwiseToLinalg
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = mulf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
Further running the LinalgFusionOfTensorOps pass on this gives the following output:
IR Dump After LinalgFusionOfTensorOps
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%arg0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
linalg.yield %2 : f32
} -> tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
%3 = mulf %2, %arg3 : f32
linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
}
- Basically, ConvertElementwiseToLinalg created two linalg.generic regions (one containing addf, one containing mulf), and LinalgFusionOfTensorOps fuses them into a single region containing both addf and mulf — so the expected output was a single region, with the first (now unused) generic eliminated.
- Both of the linalg regions created by ConvertElementwiseToLinalg use the %arg0 input as the output operand. The expected output would instead be:
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
%3 = mulf %2, %arg3 : f32
linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
}
On commenting out the logic shown below (the span between the two linked locations in llvm-project/ElementwiseToLinalg.cpp at main on GitHub), the pass instead adds
%0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
as the output operand for both linalg regions:
#if 0
bool found = false;
for (Value v : operands) {
if (v.getType() == t) {
found = true;
res.push_back(v);
break;
}
}
if (found)
continue;
#endif
The output becomes:
IR Dump After ConvertElementwiseToLinalg
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%4 = addf %arg2, %arg3 : f32
linalg.yield %4 : f32
} -> tensor<4x3x2x2xf32>
%2 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%3 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%2 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%4 = mulf %arg2, %arg3 : f32
linalg.yield %4 : f32
} -> tensor<4x3x2x2xf32>
return %3 : tensor<4x3x2x2xf32>
}
IR Dump After LinalgFusionOfTensorOps
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module {
func @addf_rank0(%arg0: tensor<4x3x2x2xf32>, %arg1: tensor<4x3x2x2xf32>) -> tensor<4x3x2x2xf32> {
%0 = linalg.init_tensor [4, 3, 2, 2] : tensor<4x3x2x2xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<4x3x2x2xf32>, tensor<4x3x2x2xf32>) outs(%0 : tensor<4x3x2x2xf32>) {
^bb0(%arg2: f32, %arg3: f32, %arg4: f32): // no predecessors
%2 = addf %arg2, %arg3 : f32
%3 = mulf %2, %arg3 : f32
linalg.yield %3 : f32
} -> tensor<4x3x2x2xf32>
return %1 : tensor<4x3x2x2xf32>
}
}
This output from ConvertElementwiseToLinalg followed by LinalgFusionOfTensorOps looks correct.