I encountered an issue when using the AffineDataCopyGeneration pass. With the test case below, the pass runs to completion without errors, but the generated copy buffer is wrong: both its contents (the copy loop nest) and its size are incorrect.
Here is my case:
// RUN: mlir-opt %s -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=3 skip-non-unit-stride-loops" | FileCheck %s
// CHECK-LABEL: @test
func @test(%in0: memref<1x35x35x4xf32>, %in1: memref<4x4x4x32xf32>, %out: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
affine.for %arg0 = 0 to 64 step 2 {
affine.for %arg1 = 0 to 1024 step 256 {
affine.for %arg3 = 0 to 256 step 4 {
affine.for %arg4 = 0 to 4 {
%5 = affine.load %out[(%arg1 + %arg3 + %arg4) floordiv 1024, ((%arg1 + %arg3 + %arg4) mod 1024) floordiv 32, (%arg1 + %arg3 + %arg4) mod 32, %arg0 floordiv 2] : memref<1x32x32x32xf32>
}
}
}
}
return %out : memref<1x32x32x32xf32>
}
The command used:
mlir-opt datacopyfordivmod3.mlir -split-input-file -affine-data-copy-generate="generate-dma=false fast-mem-space=3 skip-non-unit-stride-loops" -print-ir-after-all -print-ir-before-all
The result log:
// -----// IR Dump Before AffineDataCopyGeneration //----- //
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
affine.for %arg3 = 0 to 64 step 2 {
affine.for %arg4 = 0 to 1024 step 256 {
affine.for %arg5 = 0 to 256 step 4 {
affine.for %arg6 = 0 to 4 {
%0 = affine.load %arg2[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, %arg3 floordiv 2] : memref<1x32x32x32xf32>
}
}
}
}
return %arg2 : memref<1x32x32x32xf32>
}
// -----// IR Dump After AffineDataCopyGeneration //----- //
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_0 = arith.constant 0 : index
affine.for %arg3 = 0 to 64 step 2 {
affine.for %arg4 = 0 to 1024 step 256 {
affine.for %arg5 = 0 to 256 step 4 {
%0 = affine.apply affine_map<(d0, d1, d2) -> (d2 floordiv 2)>(%arg4, %arg5, %arg3)
%1 = memref.alloc() : memref<1x32x32x1xf32, 3>
affine.for %arg6 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>(%arg4, %arg5) to affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>(%arg4, %arg5) {
affine.for %arg7 = 0 to 32 {
affine.for %arg8 = 0 to 32 {
%2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
}
}
}
affine.for %arg6 = 0 to 4 {
%2 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>(%arg4, %arg5, %arg6, %arg3)
%3 = affine.apply affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>(%arg4, %arg5, %arg6, %arg3)
%4 = affine.apply affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>(%arg4, %arg5, %arg6, %arg3)
%5 = affine.apply affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>(%arg4, %arg5, %arg6, %arg3)
%6 = affine.load %1[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
}
memref.dealloc %1 : memref<1x32x32x1xf32, 3>
}
}
}
return %arg2 : memref<1x32x32x32xf32>
}
#map0 = affine_map<(d0, d1, d2) -> (d2 floordiv 2)>
#map1 = affine_map<(d0, d1) -> ((d0 + d1) floordiv 1024)>
#map2 = affine_map<(d0, d1) -> ((d0 + d1 + 3) floordiv 1024 + 1)>
#map3 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) floordiv 1024)>
#map4 = affine_map<(d0, d1, d2, d3) -> (((d0 + d1 + d2) mod 1024) floordiv 32)>
#map5 = affine_map<(d0, d1, d2, d3) -> ((d0 + d1 + d2) mod 32)>
#map6 = affine_map<(d0, d1, d2, d3) -> (d3 floordiv 2)>
module {
func @test(%arg0: memref<1x35x35x4xf32>, %arg1: memref<4x4x4x32xf32>, %arg2: memref<1x32x32x32xf32>) -> memref<1x32x32x32xf32> {
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%c0_0 = arith.constant 0 : index
affine.for %arg3 = 0 to 64 step 2 {
affine.for %arg4 = 0 to 1024 step 256 {
affine.for %arg5 = 0 to 256 step 4 {
%0 = affine.apply #map0(%arg4, %arg5, %arg3)
%1 = memref.alloc() : memref<1x32x32x1xf32, 3>
affine.for %arg6 = #map1(%arg4, %arg5) to #map2(%arg4, %arg5) {
affine.for %arg7 = 0 to 32 {
affine.for %arg8 = 0 to 32 {
%2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
}
}
}
affine.for %arg6 = 0 to 4 {
%2 = affine.apply #map3(%arg4, %arg5, %arg6, %arg3)
%3 = affine.apply #map4(%arg4, %arg5, %arg6, %arg3)
%4 = affine.apply #map5(%arg4, %arg5, %arg6, %arg3)
%5 = affine.apply #map6(%arg4, %arg5, %arg6, %arg3)
%6 = affine.load %1[(%arg4 + %arg5 + %arg6) floordiv 1024, ((%arg4 + %arg5 + %arg6) mod 1024) floordiv 32, (%arg4 + %arg5 + %arg6) mod 32, 0] : memref<1x32x32x1xf32, 3>
}
memref.dealloc %1 : memref<1x32x32x1xf32, 3>
}
}
}
return %arg2 : memref<1x32x32x32xf32>
}
}
This is the generated copy LoopNest:
%0 = affine.apply #map0(%arg4, %arg5, %arg3)
%1 = memref.alloc() : memref<1x32x32x1xf32, 3>
affine.for %arg6 = #map1(%arg4, %arg5) to #map2(%arg4, %arg5) {
affine.for %arg7 = 0 to 32 {
affine.for %arg8 = 0 to 32 {
%2 = affine.load %arg2[%arg6, %arg7, %arg8, %arg3 floordiv 2] : memref<1x32x32x32xf32>
affine.store %2, %1[%arg6, %arg7, %arg8, 0] : memref<1x32x32x1xf32, 3>
}
}
The generated buffer shape is 1x32x32x1xf32, which is wrong. From reading the code, it should be 1x8x32x1xf32.
Does anyone know the reason? One thing that is special about my code is that it uses floordiv and mod in the access expressions. The unit tests in the MLIR source tree don't use floordiv or mod.
Do floordiv and mod in the affine access maps lead to errors in AffineDataCopyGeneration? Are they supported?