AffineDataCopyGeneration: pass generates a wrong copy buffer with a wrong buffer size

I encountered an issue when using the AffineDataCopyGeneration pass. In the following case, the pass runs without error, but the generated result buffer (the copy buffer) is wrong, and its size is wrong as well.

Here is my case:

// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=3 skip-non-unit-stride-loops" | FileCheck %s
// CHECK-LABEL: @test
func @test(%in0: memref<1x35x35x4xf32>, %in1: memref<4x4x4x32xf32>, %out: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
  affine.for %arg0 = 0 to 64 step 2 {
    affine.for %arg1 = 0 to 1024 step 256 {
        affine.for %arg3 = 0 to 256 step 4 {
          affine.for %arg4 = 0 to 4 {
            %5 = affine.load %out[(%arg1 + %arg3 + %arg4) floordiv 1024, ((%arg1 + %arg3 + %arg4) mod 1024) floordiv 32, (%arg1 + %arg3 + %arg4) mod 32, %arg0 floordiv 2] : memref<1x32x32x32xf32>
          }
        }
    }
  }
  return %out : memref<1x32x32x32xf32>
}

The command used:

mlir-opt datacopyfordivmod3.mlir -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=3 skip-non-unit-stride-loops" -print-ir-after-all -print-ir-before-all

The result log:

// -----// IR Dump Before AffineDataCopyGeneration //----- //
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
  affine.for %arg3 = 0 to 64 step 2 {
    affine.for %arg4 = 0 to 1024 step 256 {
      affine.for %arg5 = 0 to 256 step 4 {
        affine.for %arg6 = 0 to 4 {
          %0 = affine.load %arg2[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, %arg3 floordiv 2] : memref<1x32x32x32xf32>
        }
      }
    }
  }
  return %arg2 : memref<1x32x32x32xf32>
}

// -----// IR Dump After AffineDataCopyGeneration //----- //
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
  %c1024 = arith.constant 1024 : index
  %c0 = arith.constant 0 : index
  %c0_0 = arith.constant 0 : index
  affine.for %arg3 = 0 to 64 step 2 {
    affine.for %arg4 = 0 to 1024 step 256 {
      affine.for %arg5 = 0 to 256 step 4 {
        %0 = affine.apply affine_map<(d0, d1, d2) -> (d2 floordiv 2)>(%arg4, %arg5, %arg3)
        %1 = memref.alloc() : memref<1x32x32x1xf32, 3>
        affine.for %arg6 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>(%arg4, %arg5) to affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>(%arg4, %arg5) {
          affine.for %arg7 = 0 to 32 {
            affine.for %arg8 = 0 to 32 {
              %2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
              affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
            }
          }
        }
        affine.for %arg6 = 0 to 4 {
          %2 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>(%arg4, %arg5, %arg6, %arg3)
          %3 = affine.apply affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>(%arg4, %arg5, %arg6, %arg3)
          %4 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>(%arg4, %arg5, %arg6, %arg3)
          %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>(%arg4, %arg5, %arg6, %arg3)
          %6 = affine.load %1[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
        }
        memref.dealloc %1 : memref<1x32x32x1xf32, 3>
      }
    }
  }
  return %arg2 : memref<1x32x32x32xf32>
}


#map0 = affine_map<(d0, d1, d2) -> (d2 floordiv 2)>
#map1 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>
#map2 = affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>
#map3 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>
#map4 = affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>
#map5 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>
module  {
  func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
    %c1024 = arith.constant 1024 : index
    %c0 = arith.constant 0 : index
    %c0_0 = arith.constant 0 : index
    affine.for %arg3 = 0 to 64 step 2 {
      affine.for %arg4 = 0 to 1024 step 256 {
        affine.for %arg5 = 0 to 256 step 4 {
          %0 = affine.apply #map0(%arg4, %arg5, %arg3)
          %1 = memref.alloc() : memref<1x32x32x1xf32, 3>
          affine.for %arg6 = #map1(%arg4, %arg5) to #map2(%arg4, %arg5) {
            affine.for %arg7 = 0 to 32 {
              affine.for %arg8 = 0 to 32 {
                %2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
                affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
              }
            }
          }
          affine.for %arg6 = 0 to 4 {
            %2 = affine.apply #map3(%arg4, %arg5, %arg6, %arg3)
            %3 = affine.apply #map4(%arg4, %arg5, %arg6, %arg3)
            %4 = affine.apply #map5(%arg4, %arg5, %arg6, %arg3)
            %5 = affine.apply #map6(%arg4, %arg5, %arg6, %arg3)
            %6 = affine.load %1[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
          }
          memref.dealloc %1 : memref<1x32x32x1xf32, 3>
        }
      }
    }
    return %arg2 : memref<1x32x32x32xf32>
  }
}

This is the generated copy LoopNest:

          %0 = affine.apply #map0(%arg4, %arg5, %arg3)
          %1 = memref.alloc() : memref<1x32x32x1xf32, 3>
          affine.for %arg6 = #map1(%arg4, %arg5) to #map2(%arg4, %arg5) {
            affine.for %arg7 = 0 to 32 {
              affine.for %arg8 = 0 to 32 {
                %2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
                affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
              }
            }

The generated buffer shape is 1x32x32x1xf32, which is wrong. From reading the code, it should be 1x8x32x1xf32.

Does anyone know the reason? One distinctive aspect of my code is the use of floordiv and mod; the unit tests in the MLIR source code don't use floordiv and mod.
Do floordiv and mod in the access maps lead to errors in AffineDataCopyGeneration? Are they supported?

Did you mean to say that the buffer shape is not tight/optimal (i.e., overapproximated) as opposed to wrong? Does the output violate semantics? Note that the data copy generation utility does not guarantee an optimal buffer size: however, if the buffer size is smaller than what was needed, that would be wrong and a bug. For eg. if you access memref points along a diagonal of a rectangle, you’ll still get the whole rectangle as the buffer size – this is sort of the worst-case scenario with a bounding box.

Thanks for the reply.
Actually, the result is genuinely wrong, not just overapproximated.

The earlier test case only reads from the buffer, so reading extra data in every inner loop does not matter there.
Below I create another test case that writes to the buffer; with it, we can see how this leads to incorrect results in real cases.

// -----// IR Dump After AffineDataCopyGeneration //----- //
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
  %c1024 = arith.constant 1024 : index
  %c0 = arith.constant 0 : index
  %c0_0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  affine.for %arg3 = 0 to 64 step 2 {
    affine.for %arg4 = 0 to 1024 step 256 {
      affine.for %arg5 = 0 to 256 step 4 {
        %0 = memref.alloc() : memref<1x32x32x1xf32, 3>
        affine.for %arg6 = 0 to 4 {
          %2 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>(%arg4, %arg5, %arg6, %arg3)
          %3 = affine.apply affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>(%arg4, %arg5, %arg6, %arg3)
          %4 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>(%arg4, %arg5, %arg6, %arg3)
          %5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>(%arg4, %arg5, %arg6, %arg3)
          affine.store %cst, %0[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
        }
        %1 = affine.apply affine_map<(d0, d1, d2) -> (d2 floordiv 2)>(%arg4, %arg5, %arg3)
        affine.for %arg6 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>(%arg4, %arg5) to affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>(%arg4, %arg5) {
          affine.for %arg7 = 0 to 32 {
            affine.for %arg8 = 0 to 32 {
              %2 = affine.load %0[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
              affine.store %2, %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
            }
          }
        }
        memref.dealloc %0 : memref<1x32x32x1xf32, 3>
      }
    }
  }
  return %arg2 : memref<1x32x32x32xf32>
}

#map0 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>
#map1 = affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>
#map2 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>
#map4 = affine_map<(d0, d1, d2) -> (d2 floordiv 2)>
#map5 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>
#map6 = affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>
module  {
  func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
    %c1024 = arith.constant 1024 : index
    %c0 = arith.constant 0 : index
    %c0_0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    affine.for %arg3 = 0 to 64 step 2 {
      affine.for %arg4 = 0 to 1024 step 256 {
        affine.for %arg5 = 0 to 256 step 4 {
          %0 = memref.alloc() : memref<1x32x32x1xf32, 3>
          affine.for %arg6 = 0 to 4 {
            %2 = affine.apply #map0(%arg4, %arg5, %arg6, %arg3)
            %3 = affine.apply #map1(%arg4, %arg5, %arg6, %arg3)
            %4 = affine.apply #map2(%arg4, %arg5, %arg6, %arg3)
            %5 = affine.apply #map3(%arg4, %arg5, %arg6, %arg3)
            affine.store %cst, %0[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
          }
          %1 = affine.apply #map4(%arg4, %arg5, %arg3)
          affine.for %arg6 = #map5(%arg4, %arg5) to #map6(%arg4, %arg5) {
            affine.for %arg7 = 0 to 32 {
              affine.for %arg8 = 0 to 32 {
                %2 = affine.load %0[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
                affine.store %2, %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
              }
            }
          }
          memref.dealloc %0 : memref<1x32x32x1xf32, 3>
        }
      }
    }
    return %arg2 : memref<1x32x32x32xf32>
  }
}

This is the generated data copy from the promoted memory buffer back to the given buffer.
The copy should write only 8x32 memory items, not 32x32.
(For simplicity, I fill the whole buffer with zeros here, but in a real case, if we were storing different values, the result would be wrong.)

            affine.for %arg7 = 0 to 32 {
              affine.for %arg8 = 0 to 32 {
                %2 = affine.load %0[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
                affine.store %2, %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
              }
            }

I was referring to the earlier test case you posted where there were no stores – I don’t see anything wrong there yet. In the case of stores where the memref region is strided (happens when you use mod/floordiv or a non-unit stride) which is the second one you posted, the approximate regions are problematic – however, support for this should be easy to add now. Could you please create a bug/issue on LLVM github issues with a minimal simple test case? (the one you have above can be simplified further)

Sure. I will open an issue, with a simpler test case.
Thanks.

one issue opened: AffineDataCopyGeneration generated Memory buf copy is not right if the buffer access is stride · Issue #54994 · llvm/llvm-project · GitHub

Just curious — when do you plan to support this strided access? I am willing to try implementing it, and I would continue my own work based on it. Thank you very much.

Uday,
Hello — do you plan to optimize the fast-memory consumption in the case of strided access?