Vector.transfer_write error

I am implementing optimizations for a conv2d_nhwc_hwcf operator. I plan to use vector.transfer_read inside an affine.for loop to read values along the W dimension and vector.transfer_write to write them back. However, the vector.transfer_write operation produces an error when I test with mlir-cpu-runner. Below is my MLIR program:

#map0 = affine_map<(d0, d1, d2, d3) -> (d2)>
#map1 = affine_map<(d0) -> (d0 ceildiv 32)>
module {
  func.func private @printMemrefF32(memref<*xf32>)
  // Allocate a rank-4 dynamically-shaped buffer and fill every element
  // with the given f32 value.
  func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref<?x?x?x?xf32> {
    %zero = arith.constant 0 : index
    %one = arith.constant 1 : index
    // Create the buffer with the four requested dynamic extents.
    %buf = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref<?x?x?x?xf32>
    // Store the fill value %arg4 into every position of the buffer.
    scf.for %d0 = %zero to %arg0 step %one {
      scf.for %d1 = %zero to %arg1 step %one {
        scf.for %d2 = %zero to %arg2 step %one {
          scf.for %d3 = %zero to %arg3 step %one {
            memref.store %arg4, %buf[%d0, %d1, %d2, %d3] : memref<?x?x?x?xf32>
          }
        }
      }
    }
    return %buf : memref<?x?x?x?xf32>
  }

  // Convolution implementation (NHWC input, HWCF kernel, NHWF output).
  //
  // For every non-zero kernel element, a 32-lane vector of input values is
  // read along the W axis, multiplied by the broadcast kernel value, and
  // accumulated into the output through a masked vector.transfer_write.
  // The W axis is tiled by 32 (#map1 = ceildiv 32); %mask_vec disables the
  // lanes past the end of the last (tail) tile.
  //
  // NOTE(review): #map0 = (d0, d1, d2, d3) -> (d2) selects memref dimension
  // d2 (the W axis), which is NOT the innermost dimension of these NHWC/NHWF
  // memrefs, so the 32 lanes are strided by the channel extent in memory.
  // A masked transfer with such a non-minor-identity permutation map must be
  // decomposed by -convert-vector-to-scf; if the op instead survives to the
  // contiguous vector-to-llvm lowering, the store writes 32 *adjacent*
  // floats at the strided base address and runs past the buffer. That would
  // match the observed behavior (crash disappears for smaller shapes or when
  // the f/c dimensions are removed). Confirm how the transfer is lowered, or
  // vectorize along the minor dimension (c for the input, f for the
  // kernel/output) so the transfers become contiguous minor-identity maps.
  func.func @conv_2d_nhwc_hwcf(%input: memref<?x?x?x?xf32>,
                               %kernel: memref<?x?x?x?xf32>,
                               %output: memref<?x?x?x?xf32>) {
    %c0 = arith.constant 0 : index
    %c0_f32 = arith.constant 0.0 : f32
    %c1 = arith.constant 1 : index
    %c2 = arith.constant 2 : index
    %c3 = arith.constant 3 : index
    %c32 = arith.constant 32 : index
    // Get the n size. (batch)
    %n = memref.dim %input, %c0 : memref<?x?x?x?xf32>
    // Get the f size. (feature)
    %f = memref.dim %kernel, %c3 : memref<?x?x?x?xf32>
    // Get the c size. (channel)
    %c = memref.dim %kernel, %c2 : memref<?x?x?x?xf32>
    // Get the 2D output size. (row and column)
    %output_row = memref.dim %output, %c1 : memref<?x?x?x?xf32>
    %output_col = memref.dim %output, %c2 : memref<?x?x?x?xf32>
    // Get the 2D kernel size. (row and column)
    %kernel_row = memref.dim %kernel, %c0 : memref<?x?x?x?xf32>
    %kernel_col = memref.dim %kernel, %c1 : memref<?x?x?x?xf32>

    affine.for %n_idx = %c0 to %n {
      affine.for %output_row_idx = %c0 to %output_row {
        affine.for %kernel_row_idx = %c0 to %kernel_row {
          affine.for %kernel_col_idx = %c0 to %kernel_col {
            // Iterate over the W axis in tiles of 32 lanes.
            affine.for %output_col_idx = %c0 to #map1(%output_col) {
              affine.for %c_idx = %c0 to %c {
                affine.for %f_idx = %c0 to %f {
                  // Skip the whole tile when the kernel element is zero
                  // (sparsity check).
                  %kernel_ele = memref.load %kernel[%kernel_row_idx, %kernel_col_idx, %c_idx, %f_idx] : memref<?x?x?x?xf32>
                  %sparsity_flag = arith.cmpf one, %kernel_ele, %c0_f32 : f32
                  scf.if %sparsity_flag {
                    // Broadcast the scalar kernel value across all lanes.
                    %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<32xf32>
                    // First output column of the current 32-wide tile.
                    // (Previously this product was computed twice, as both
                    // %output_col_cur and %output_col_idx_tail.)
                    %output_col_cur = arith.muli %output_col_idx, %c32 : index
                    // Remaining columns; < 32 only in the last (tail) tile.
                    // vector.create_mask clamps values >= 32 to "all lanes".
                    %tail_len = arith.subi %output_col, %output_col_cur : index
                    // Input coordinates shifted by the kernel offsets.
                    %input_row_idx_tail = arith.addi %output_row_idx, %kernel_row_idx : index
                    %input_col_idx_tail = arith.addi %kernel_col_idx, %output_col_cur : index
                    %mask_vec = vector.create_mask %tail_len : vector<32xi1>
                    // Gather 32 input values along W (strided by c — see
                    // NOTE(review) above), fma with the broadcast kernel
                    // value, and accumulate into the output tile.
                    %input_vec_tail = vector.transfer_read %input[%n_idx, %input_row_idx_tail, %input_col_idx_tail, %c_idx], %c0_f32, %mask_vec {permutation_map = #map0, in_bounds = [false]} : memref<?x?x?x?xf32>, vector<32xf32>
                    %output_vec_tail = vector.transfer_read %output[%n_idx, %output_row_idx, %output_col_cur, %f_idx], %c0_f32, %mask_vec {permutation_map = #map0, in_bounds = [false]} : memref<?x?x?x?xf32>, vector<32xf32>
                    %result_vec_tail = vector.fma %input_vec_tail, %kernel_vec, %output_vec_tail : vector<32xf32>
                    vector.transfer_write %result_vec_tail, %output[%n_idx, %output_row_idx, %output_col_cur, %f_idx], %mask_vec {permutation_map = #map0, in_bounds = [false]} : vector<32xf32>, memref<?x?x?x?xf32>
                  }
                }
              }
            }
          }
        }
      }
    }
    return
  }

  // Entry point: build the input/kernel/output buffers, run the
  // convolution, and print the result.
  func.func @main() {
    // Fill values: ones for the input and kernel, zeros for the output
    // accumulator.
    %one_f = arith.constant 1.000000e+00 : f32
    %zero_f = arith.constant 0.000000e+00 : f32

    // Input layout (NHWC).
    %in_n = arith.constant 1 : index
    %in_h = arith.constant 58 : index
    %in_w = arith.constant 58 : index
    %in_c = arith.constant 64 : index

    // Kernel layout (HWCF).
    %ker_h = arith.constant 3 : index
    %ker_w = arith.constant 3 : index
    %ker_c = arith.constant 64 : index
    %ker_f = arith.constant 64 : index

    // Output layout (NHWF).
    %out_n = arith.constant 1 : index
    %out_h = arith.constant 56 : index
    %out_w = arith.constant 56 : index
    %out_f = arith.constant 64 : index

    // Allocate and initialize the three buffers.
    %input = call @alloc_f32(%in_n, %in_h, %in_w, %in_c, %one_f) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
    %kernel = call @alloc_f32(%ker_h, %ker_w, %ker_c, %ker_f, %one_f) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
    %output = call @alloc_f32(%out_n, %out_h, %out_w, %out_f, %zero_f) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>

    // Run the convolution.
    call @conv_2d_nhwc_hwcf(%input, %kernel, %output) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()

    // Dump the result through the runner utility.
    %unranked = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32>
    call @printMemrefF32(%unranked) : (memref<*xf32>) -> ()

    // Release all buffers.
    memref.dealloc %output : memref<?x?x?x?xf32>
    memref.dealloc %input : memref<?x?x?x?xf32>
    memref.dealloc %kernel : memref<?x?x?x?xf32>
    return
  }
}

The error message I received is shown in the following figure. The command I used to run the program is mlir-opt test_conv2d_nhwc_hwcf.mlir -convert-linalg-to-loops -convert-vector-to-scf -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | mlir-cpu-runner -e main -entry-point-result=void -shared-libs=libmlir_runner_utils.dylib -shared-libs=libmlir_c_runner_utils.dylib

After some debugging, I found that the program runs correctly if I simply comment out the following line of code: vector.transfer_write %result_vec_tail, %output[%n_idx, %output_row_idx, %output_col_idx_tail, %f_idx], %mask_vec {permutation_map = #map0, in_bounds = [false]} : vector<32xf32>, memref<?x?x?x?xf32>. After multiple tests, I also observed that the program runs correctly when the f, c, and kernel_w dimensions are removed. I hope you can help me resolve this issue, as it is urgent.
One more crucial piece of information: I eventually discovered that reducing the sizes of the input and the convolution kernel allows the program to run normally. It's possible that the vector.transfer_read operation is highly performance-intensive.