Issue with Removing Redundant memref.subview and memref.copy Operations in MLIR IR

Hello,

I’ve been working on optimizing some MLIR IR code and encountered an issue where certain redundant operations, specifically memref.subview and memref.copy, are not being eliminated despite using various optimization passes. Below is a simplified version of the IR I’m working with:

// Module lowered from an ONNX model ("conv_model" postfix) targeting x86_64 Linux.
module attributes {llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "onnx-mlir.symbol-postfix" = "conv_model"} {
  // Permutation constants (values [0,3,1,2] and [0,2,3,1]) — presumably NHWC<->NCHW
  // transpose orders; note they are never loaded inside @main_graph below.
  memref.global "private" constant @__constant_4xi32_0 : memref<4xi32> = dense<[0, 3, 1, 2]> {alignment = 64 : i64}
  memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[0, 2, 3, 1]> {alignment = 64 : i64}
  // Per-channel f32 constants (3 values) — read once per output channel in the loop nest below.
  memref.global "private" constant @__constant_3xf32 : memref<3xf32> = dense<[0.0848027915, 0.182504401, -0.0572209656]> {alignment = 64 : i64}
  // 3x3x3x3 f32 constant tensor — indexed like convolution weights in the innermost loops.
  memref.global "private" constant @__constant_3x3x3x3xf32 : memref<3x3x3x3xf32> = dense<[[[[-0.0225044936, -0.0943698287, -0.120647207], [0.105502918, -0.126328245, -0.00880287587], [0.136676386, 0.191608056, -0.135147989]], [[0.0652842373, -0.00357380509, -0.161090687], [0.00233793259, -0.1222836, 0.0852289647], [-0.17497915, -0.162808597, 9.346090e-02]], [[0.0246987641, 0.191776618, 0.0953016728], [0.0329508334, 0.190744057, 0.0123830587], [0.0676434487, 0.0313460976, 0.111728624]]], [[[-0.033690244, -0.162302285, -0.0462143123], [-0.0214170814, -0.0673772842, 0.0388933718], [0.153362706, 0.135804817, -0.122447476]], [[-0.17577371, 0.0854461342, -0.104730204], [0.170401827, 0.118987992, -0.0499024391], [-0.110257789, -0.191433772, 0.0553221852]], [[0.161009535, 0.102270827, 0.0155283213], [0.0373522341, -0.0569170564, -0.0869269445], [0.012848258, 0.0481289476, 0.169258639]]], [[[-0.0899311602, -0.113618031, 0.181085333], [-0.165421411, 0.170553818, -0.0421401858], [0.155273303, -0.125412107, 0.0310223848]], [[-6.817500e-02, -0.0752154365, -0.189589798], [-0.186942071, -0.00761026144, -0.10571067], [0.126854047, 0.14980723, 0.0658958107]], [[0.0161474049, 0.184888825, -0.10079474], [0.00335976481, 0.138415352, -0.0892849564], [-0.0695816576, 0.105078682, -0.0628482103]]]]> {alignment = 64 : i64}
  
  // Entry point. Takes the input "data" (1x3x4x4) and an "output" argument.
  // NOTE(review): %arg1 ("output") is never written to in this function body —
  // the final results stay in the stack buffer %alloca_2, which may itself be
  // why later passes cannot prove anything here is dead or redundant.
  func.func @main_graph(%arg0: memref<1x3x4x4xf32> {onnx.name = "data"}, %arg1: memref<1x3x4x4xf32> {onnx.name = "output"}) {
    %c6 = arith.constant 6 : index
    %c4 = arith.constant 4 : index
    %c3 = arith.constant 3 : index
    %c1 = arith.constant 1 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    // Two 1-element scratch buffers used to stage scalar loads inside the loop nest.
    %alloca = memref.alloca() : memref<1x1x1x1xf32>
    %alloca_0 = memref.alloca() : memref<1x1x1x1xf32>
    %0 = memref.get_global @__constant_3x3x3x3xf32 : memref<3x3x3x3xf32>
    %1 = memref.get_global @__constant_3xf32 : memref<3xf32>
    // Layout change: permute %arg0 (1x3x4x4, dims [n,c,h,w]) into %alloca_1
    // (1x4x4x3, dims [n,h,w,c]) — an element-wise NCHW -> NHWC transpose.
    %alloca_1 = memref.alloca() {alignment = 64 : i64} : memref<1x4x4x3xf32>
    scf.for %arg2 = %c0 to %c4 step %c1 {
      scf.for %arg3 = %c0 to %c4 step %c1 {
        scf.for %arg4 = %c0 to %c3 step %c1 {
          %2 = memref.load %arg0[%c0, %arg4, %arg2, %arg3] : memref<1x3x4x4xf32>
          memref.store %2, %alloca_1[%c0, %arg2, %arg3, %arg4] : memref<1x4x4x3xf32>
        }
      }
    }
    // Padded buffer: 6x6 spatial dims = 4x4 input plus a 1-element border, zero-filled.
    %alloca_2 = memref.alloca() {alignment = 64 : i64} : memref<1x6x6x3xf32>
    scf.for %arg2 = %c0 to %c6 step %c1 {
      scf.for %arg3 = %c0 to %c6 step %c1 {
        scf.for %arg4 = %c0 to %c3 step %c1 {
          memref.store %cst, %alloca_2[%c0, %arg2, %arg3, %arg4] : memref<1x6x6x3xf32>
        }
      }
    }
    // The subview selects the interior 1x4x4x3 window of the padded buffer
    // (offset [0,1,1,0] = skip the 1-wide zero border); the copy then places the
    // transposed input into that interior. This is the padding materialization —
    // it is NOT dead: %alloca_2 is loaded (and stored) in the loop nest below.
    %subview = memref.subview %alloca_2[0, 1, 1, 0] [1, 4, 4, 3] [1, 1, 1, 1] : memref<1x6x6x3xf32> to memref<1x4x4x3xf32, strided<[108, 18, 3, 1], offset: 21>>
    memref.copy %alloca_1, %subview : memref<1x4x4x3xf32> to memref<1x4x4x3xf32, strided<[108, 18, 3, 1], offset: 21>>
    // Convolution-style loop nest: outer loops over output channel (%arg2) and
    // 4x4 spatial positions; inner 3x3x3 loops over the weight window.
    scf.for %arg2 = %c0 to %c3 step %c1 {
      scf.for %arg3 = %c0 to %c4 step %c1 {
        scf.for %arg4 = %c0 to %c4 step %c1 {
          // Stage the per-channel constant into the 1-element scratch buffer.
          %2 = memref.load %1[%arg2] : memref<3xf32>
          memref.store %2, %alloca[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32>
          scf.for %arg5 = %c0 to %c3 step %c1 {
            scf.for %arg6 = %c0 to %c3 step %c1 {
              scf.for %arg7 = %c0 to %c3 step %c1 {
                // Weight value is staged into %alloca_0 but never read back —
                // only %8 (from %alloca, the per-channel constant) feeds the math.
                %4 = memref.load %0[%arg2, %arg7, %arg5, %arg6] : memref<3x3x3x3xf32>
                memref.store %4, %alloca_0[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32>
                %5 = arith.addi %arg3, %arg5 : index
                %6 = arith.addi %arg4, %arg6 : index
                // Read-modify-write of the padded buffer: %10 = %7 * (1 + %8).
                // Because %alloca_2 is both read and updated in place here, the
                // earlier subview/copy writes are live and cannot be removed.
                %7 = memref.load %alloca_2[%c0, %5, %6, %arg7] : memref<1x6x6x3xf32>
                %8 = memref.load %alloca[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32>
                %9 = arith.mulf %7, %8 : f32
                %10 = arith.addf %7, %9 : f32
                memref.store %10, %alloca_2[%c0, %5, %6, %arg7] : memref<1x6x6x3xf32>
              }
            }
          }
        }
      }
    }
    return
  }
}

I’ve tried using the following commands to optimize the IR:

--cse  --sccp --canonicalize="top-down"

However, the memref.subview and memref.copy operations are still not being removed even though they seem redundant to me. For instance, in this IR, the subview result is only used once, immediately after its creation, by the memref.copy — although I notice the underlying buffer (%alloca_2) is still loaded and stored in the loop nest afterwards, so perhaps the copy is not actually dead.
My questions are:

  1. Why aren’t these redundant operations being eliminated by the optimization passes?
  2. Are there additional optimization passes or techniques I should consider to remove such redundant operations?
  3. Would it be advisable to manually modify the IR, or is there a better approach within the MLIR framework to address this?

Any guidance on how to handle this situation would be greatly appreciated!

Thank you!

This issue seems to be caused by the padding (the subview writes the input into the interior of a zero-initialized 6x6 buffer) — could that be the case?