Hi all!
I have a fundamental question about how tensor.empty should be used, and whether it is necessary to run CSE to remove redundant tensor.empty ops before converting them to bufferization.alloc_tensor and running one-shot-bufferize.
From the documentation, tensor.empty is simply used to communicate the shape of the destination to DestinationStyleOpInterface ops; however, the eliminate-empty-tensors pass only eliminates a tensor.empty in certain cases, namely when the tensor is eventually inserted into another tensor. This means that any tensor.empty that is not eliminated will be converted to bufferization.alloc_tensor and correspond to a memref.alloc after one-shot-bufferize.
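For concreteness, here is a hypothetical example of the kind of case where (as I understand it) the pass does fire: the value computed into the tensor.empty is later inserted into another tensor, so the pass can rewrite the tensor.empty as an extract_slice of the eventual destination.
func.func @eliminable(%t: tensor<10xf32>, %cst: f32) -> tensor<10xf32> {
  // The contents of this tensor.empty end up inside %t via tensor.insert_slice,
  // so eliminate-empty-tensors can replace it with
  //   %0 = tensor.extract_slice %t[0] [5] [1]
  // and the fill then writes directly into %t's buffer instead of a fresh allocation.
  %0 = tensor.empty() : tensor<5xf32>
  %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<5xf32>) -> tensor<5xf32>
  %2 = tensor.insert_slice %1 into %t[0] [5] [1] : tensor<5xf32> into tensor<10xf32>
  return %2 : tensor<10xf32>
}
None of the tensor.empty ops in my example below are inserted into another tensor, which presumably is why none of them get eliminated.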
This leads to my question: when creating a linalg op (or any other op that implements DestinationStyleOpInterface), should I:
1. always create a new tensor.empty op with the right shape for its destination operand, or
2. try to reuse tensors returned by other ops?
I have an example input where creating a new tensor.empty for each destination operand makes one-shot-bufferize allocate one more buffer than reusing a single tensor.empty:
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @test(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x512xbf16>) -> tensor<512x512xbf16> {
  %0 = tensor.empty() : tensor<512x512xbf16>
  %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%0 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %6 = arith.mulf %in, %in_0 : bf16
    linalg.yield %6 : bf16
  } -> tensor<512x512xbf16>
  %2 = tensor.empty() : tensor<512x512xbf16>
  %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%2 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %6 = arith.mulf %in, %in_0 : bf16
    linalg.yield %6 : bf16
  } -> tensor<512x512xbf16>
  %4 = tensor.empty() : tensor<512x512xbf16>
  %5 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%3, %arg1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%4 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %6 = arith.addf %in, %in_0 : bf16
    linalg.yield %6 : bf16
  } -> tensor<512x512xbf16>
  return %5 : tensor<512x512xbf16>
}
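For reference, here is the full pipeline invocation I'm using (via mlir-opt; input.mlir is a placeholder for the file containing the function above):
mlir-opt input.mlir \
  --eliminate-empty-tensors \
  --empty-tensor-to-alloc-tensor \
  --one-shot-bufferize="allow-return-allocs=true allow-unknown-ops=false bufferize-function-boundaries=true"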
Running this pipeline, I got the following codegen with 3 memref.alloc ops:
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @optimized(%arg0: memref<512x512xbf16, strided<[?, ?], offset: ?>>, %arg1: memref<512x512xbf16, strided<[?, ?], offset: ?>>) -> memref<512x512xbf16> {
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<512x512xbf16, strided<[?, ?], offset: ?>>, memref<512x512xbf16, strided<[?, ?], offset: ?>>) outs(%alloc : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
      %0 = arith.mulf %in, %in_2 : bf16
      linalg.yield %0 : bf16
    }
    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc, %alloc : memref<512x512xbf16>, memref<512x512xbf16>) outs(%alloc_0 : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
      %0 = arith.mulf %in, %in_2 : bf16
      linalg.yield %0 : bf16
    }
    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc_0, %arg1 : memref<512x512xbf16>, memref<512x512xbf16, strided<[?, ?], offset: ?>>) outs(%alloc_1 : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_2: bf16, %out: bf16):
      %0 = arith.addf %in, %in_2 : bf16
      linalg.yield %0 : bf16
    }
    memref.dealloc %alloc : memref<512x512xbf16>
    memref.dealloc %alloc_0 : memref<512x512xbf16>
    return %alloc_1 : memref<512x512xbf16>
  }
}
Rerunning with CSE added in front (--cse --eliminate-empty-tensors --empty-tensor-to-alloc-tensor --one-shot-bufferize="allow-return-allocs=true allow-unknown-ops=false bufferize-function-boundaries=true"), so that the redundant tensor.empty ops are deduplicated first, produced the following codegen:
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  func.func @optimized(%arg0: memref<512x512xbf16, strided<[?, ?], offset: ?>>, %arg1: memref<512x512xbf16, strided<[?, ?], offset: ?>>) -> memref<512x512xbf16> {
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : memref<512x512xbf16, strided<[?, ?], offset: ?>>, memref<512x512xbf16, strided<[?, ?], offset: ?>>) outs(%alloc : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_1: bf16, %out: bf16):
      %0 = arith.mulf %in, %in_1 : bf16
      linalg.yield %0 : bf16
    }
    %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x512xbf16>
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc, %alloc : memref<512x512xbf16>, memref<512x512xbf16>) outs(%alloc_0 : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_1: bf16, %out: bf16):
      %0 = arith.mulf %in, %in_1 : bf16
      linalg.yield %0 : bf16
    }
    linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc_0, %arg1 : memref<512x512xbf16>, memref<512x512xbf16, strided<[?, ?], offset: ?>>) outs(%alloc : memref<512x512xbf16>) {
    ^bb0(%in: bf16, %in_1: bf16, %out: bf16):
      %0 = arith.addf %in, %in_1 : bf16
      linalg.yield %0 : bf16
    }
    memref.dealloc %alloc_0 : memref<512x512xbf16>
    return %alloc : memref<512x512xbf16>
  }
}
In this case, we see only 2 memref.alloc ops.
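For clarity, the effect of --cse here is to merge the three identical tensor.empty ops into a single one, so all three linalg.generic ops share the same destination value (abridged sketch; generic bodies elided, they are unchanged from above):
%0 = tensor.empty() : tensor<512x512xbf16>
%1 = linalg.generic ... ins(%arg0, %arg1 : ...) outs(%0 : tensor<512x512xbf16>) ...  // mulf
%2 = linalg.generic ... ins(%1, %1 : ...) outs(%0 : tensor<512x512xbf16>) ...        // mulf
%3 = linalg.generic ... ins(%2, %arg1 : ...) outs(%0 : tensor<512x512xbf16>) ...     // addf
This sharing is presumably what allows one-shot-bufferize to reuse %alloc (the buffer for %0) as the destination of the last generic.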
Another question I have: is it necessary and correct to always use tensor.empty for the destination operands? Would something like the following be correct? Here I'm reusing the result tensor of one linalg op as the destination operand of subsequent linalg ops:
#map = affine_map<(d0, d1) -> (d0, d1)>
func.func @test(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x512xbf16>) -> tensor<512x512xbf16> {
  %0 = tensor.empty() : tensor<512x512xbf16>
  %1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%0 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %4 = arith.mulf %in, %in_0 : bf16
    linalg.yield %4 : bf16
  } -> tensor<512x512xbf16>
  %2 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%1 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %4 = arith.mulf %in, %in_0 : bf16
    linalg.yield %4 : bf16
  } -> tensor<512x512xbf16>
  %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %arg1 : tensor<512x512xbf16>, tensor<512x512xbf16>) outs(%2 : tensor<512x512xbf16>) {
  ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
    %4 = arith.addf %in, %in_0 : bf16
    linalg.yield %4 : bf16
  } -> tensor<512x512xbf16>
  return %3 : tensor<512x512xbf16>
}
Thank you!