Liveness Analysis for Bufferization deallocation

@matthias-springer I noticed that the ownership based deallocation pass adds the bufferization.dealloc op at the end of the scope, and thus moves the deallocations for all buffers in the context block to the end.

This is safe to do; however, it also keeps buffers allocated long after their last use, which can make larger models run out of memory. Given that we don’t yet reuse buffers of the same size across independent uses, this memory is simply wasted.

@nicolasvasilache told me we do have a liveness analysis pass. Have you done any work on using the result of that pass to tell the deallocation pass where to insert the deallocation for each buffer?

We use the ownership deallocation pass in our research, but this isn’t a critical point for us (our models are still small). However, I imagine it could be a reasonably simple change. I’m just wondering if there’s anything done at this level and what’s the status of it.

We have a similar issue. We previously used BufferDeallocationPass, which inserted the deallocations as early as possible according to its analysis. Now we have to move to the BufferDeallocationPipelinePass, and the deallocations are inserted at the end. This change caused a performance issue for us. We would like to know what the solution to this issue is.

Can you post some IR that shows the bufferization.dealloc ops? I.e., not running the entire pipeline but just -ownership-based-buffer-deallocation -buffer-deallocation-simplification -canonicalize. Maybe moving the bufferization.dealloc ops up as far as possible in the same block (such that op dominance is still satisfied) is sufficient in your case.

Here is the output of the DeallocationPipeline on a simple example: a tensor sqrt followed by a tensor add. Previously, the free was inserted before the second loop nest; now it is at the end of the function.

// -----// IR Dump After ExpandRealloc (expand-realloc) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After OwnershipBasedBufferDeallocation (ownership-based-buffer-deallocation) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%false = arith.constant false
%true = arith.constant true
%false_0 = arith.constant false
%true_1 = arith.constant true
%false_2 = arith.constant false
%false_3 = arith.constant false
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
%false_18 = arith.constant false
%false_19 = arith.constant false
%false_20 = arith.constant false
scf.for %arg3 = %c0 to %c5 step %c1 {
%false_21 = arith.constant false
%false_22 = arith.constant false
%5 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%6 = math.sqrt %5 : f32
memref.store %6, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_4 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_4, %dim : index
%1 = arith.select %0, %dim_4, %dim : index
%alloc_5 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_4, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
%false_18 = arith.constant false
%false_19 = arith.constant false
%false_20 = arith.constant false
%false_21 = arith.constant false
scf.for %arg3 = %c0 to %c5 step %c1 {
%false_22 = arith.constant false
%false_23 = arith.constant false
%false_24 = arith.constant false
%5 = arith.select %2, %arg2, %c0 : index
%6 = memref.load %alloc[%5, %arg3] : memref<?x5xf32>
%7 = arith.select %3, %arg2, %c0 : index
%8 = memref.load %arg1[%7, %arg3] : memref<?x5xf32>
%9 = arith.addf %6, %8 : f32
memref.store %9, %alloc_5[%arg2, %arg3] : memref<?x5xf32>
}
}
%base_buffer, %offset, %sizes:2, %strides:2 = memref.extract_strided_metadata %arg0 : memref<?x5xf32> -> memref<f32>, index, index, index, index, index
%base_buffer_6, %offset_7, %sizes_8:2, %strides_9:2 = memref.extract_strided_metadata %arg1 : memref<?x5xf32> -> memref<f32>, index, index, index, index, index
%base_buffer_10, %offset_11, %sizes_12:2, %strides_13:2 = memref.extract_strided_metadata %alloc : memref<?x5xf32> -> memref<f32>, index, index, index, index, index
%base_buffer_14, %offset_15, %sizes_16:2, %strides_17:2 = memref.extract_strided_metadata %alloc_5 : memref<?x5xf32> -> memref<f32>, index, index, index, index, index
%4 = bufferization.dealloc (%base_buffer, %base_buffer_6, %base_buffer_10, %base_buffer_14 : memref<f32>, memref<f32>, memref<f32>, memref<f32>) if (%false_2, %false_3, %true_1, %true) retain (%alloc_5 : memref<?x5xf32>)
return %alloc_5 : memref<?x5xf32>
}
}

// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%true = arith.constant true
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%5 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%6 = math.sqrt %5 : f32
memref.store %6, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%5 = arith.select %2, %arg2, %c0 : index
%6 = memref.load %alloc[%5, %arg3] : memref<?x5xf32>
%7 = arith.select %3, %arg2, %c0 : index
%8 = memref.load %arg1[%7, %arg3] : memref<?x5xf32>
%9 = arith.addf %6, %8 : f32
memref.store %9, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
%4 = bufferization.dealloc (%alloc, %alloc_1 : memref<?x5xf32>, memref<?x5xf32>) if (%true, %true) retain (%alloc_1 : memref<?x5xf32>)
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After BufferDeallocationSimplification (buffer-deallocation-simplification) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%true = arith.constant true
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
bufferization.dealloc (%alloc : memref<?x5xf32>) if (%true)
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After LowerDeallocations (bufferization-lower-deallocations) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%true = arith.constant true
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
scf.if %true {
memref.dealloc %alloc : memref<?x5xf32>
}
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After CSE (cse) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%true = arith.constant true
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
scf.if %true {
memref.dealloc %alloc : memref<?x5xf32>
}
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
memref.dealloc %alloc : memref<?x5xf32>
return %alloc_1 : memref<?x5xf32>
}
}

// -----// IR Dump After ConvertBufferizationToMemRef (convert-bufferization-to-memref) //----- //
module attributes {llvm.data_layout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-apple-darwin22.3.0", "onnx-mlir.symbol-postfix" = "fuse18"} {
func.func @fuse_element_18(%arg0: memref<?x5xf32>, %arg1: memref<?x5xf32>) -> memref<?x5xf32> attributes {llvm.emit_c_interface} {
%c5 = arith.constant 5 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = memref.dim %arg0, %c0 : memref<?x5xf32>
%alloc = memref.alloc(%dim) {alignment = 16 : i64} : memref<?x5xf32>
scf.for %arg2 = %c0 to %dim step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = memref.load %arg0[%arg2, %arg3] : memref<?x5xf32>
%5 = math.sqrt %4 : f32
memref.store %5, %alloc[%arg2, %arg3] : memref<?x5xf32>
}
}
%dim_0 = memref.dim %arg1, %c0 : memref<?x5xf32>
%0 = arith.cmpi sgt, %dim_0, %dim : index
%1 = arith.select %0, %dim_0, %dim : index
%alloc_1 = memref.alloc(%1) {alignment = 16 : i64} : memref<?x5xf32>
%2 = arith.cmpi sgt, %dim, %c1 : index
%3 = arith.cmpi sgt, %dim_0, %c1 : index
scf.for %arg2 = %c0 to %1 step %c1 {
scf.for %arg3 = %c0 to %c5 step %c1 {
%4 = arith.select %2, %arg2, %c0 : index
%5 = memref.load %alloc[%4, %arg3] : memref<?x5xf32>
%6 = arith.select %3, %arg2, %c0 : index
%7 = memref.load %arg1[%6, %arg3] : memref<?x5xf32>
%8 = arith.addf %5, %7 : f32
memref.store %8, %alloc_1[%arg2, %arg3] : memref<?x5xf32>
}
}
memref.dealloc %alloc : memref<?x5xf32>
return %alloc_1 : memref<?x5xf32>
}
}

It is not as simple as I thought: we need an alias analysis. Based on that, we can move the bufferization.dealloc ops as far up as possible, i.e., right after the last use of any alias of the buffer (not taking into account the terminator of the block).

I think we can build something on top of BufferViewFlowAnalysis.

1 Like