Can't lower to LLVM after Linalg tiling

Here is my MLIR code. I don't know how to properly lower it to LLVM IR and compile it into an object file.

// Identity 2-D indexing map; used for both inputs and the output of the
// linalg.generic below.
#map = affine_map<(d0, d1) -> (d0, d1)>
module {
  // Multiplies %arg0 and %arg1 element-wise, walking the 4096x4096 tensors in
  // 512x512 tiles with a two-level scf.for nest, and returns the result tensor.
  func.func @linalg_opt(%arg0: tensor<4096x4096xf32>, %arg1: tensor<4096x4096xf32>) -> tensor<4096x4096xf32> {
    // Loop bounds: both loops run from 0 to 4096 in steps of 512.
    %c4096 = arith.constant 4096 : index
    %c0 = arith.constant 0 : index
    %c512 = arith.constant 512 : index
    // Result tensor; passed as the initial iter_arg of the outer loop.
    %0 = tensor.empty() : tensor<4096x4096xf32>
    // Outer loop: %arg2 is the tile offset along dimension 0.
    %1 = scf.for %arg2 = %c0 to %c4096 step %c512 iter_args(%arg3 = %0) -> (tensor<4096x4096xf32>) {
      // Inner loop: %arg4 is the tile offset along dimension 1.
      %2 = scf.for %arg4 = %c0 to %c4096 step %c512 iter_args(%arg5 = %arg3) -> (tensor<4096x4096xf32>) {
        // Extract the 512x512 tile at offset (%arg2, %arg4) from each input
        // and from the result tensor carried through the loop.
        %extracted_slice = tensor.extract_slice %arg0[%arg2, %arg4] [512, 512] [1, 1] : tensor<4096x4096xf32> to tensor<512x512xf32>
        %extracted_slice_0 = tensor.extract_slice %arg1[%arg2, %arg4] [512, 512] [1, 1] : tensor<4096x4096xf32> to tensor<512x512xf32>
        %extracted_slice_1 = tensor.extract_slice %arg5[%arg2, %arg4] [512, 512] [1, 1] : tensor<4096x4096xf32> to tensor<512x512xf32>
        // Element-wise product of the two input tiles; both dimensions are
        // parallel. The %out block argument is not used by the body.
        %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%extracted_slice, %extracted_slice_0 : tensor<512x512xf32>, tensor<512x512xf32>) outs(%extracted_slice_1 : tensor<512x512xf32>) {
        ^bb0(%in: f32, %in_2: f32, %out: f32):
          %4 = arith.mulf %in, %in_2 : f32
          linalg.yield %4 : f32
        } -> tensor<512x512xf32>
        // Insert the computed tile back into the result tensor at the same offset.
        %inserted_slice = tensor.insert_slice %3 into %arg5[%arg2, %arg4] [512, 512] [1, 1] : tensor<512x512xf32> into tensor<4096x4096xf32>
        scf.yield %inserted_slice : tensor<4096x4096xf32>
      }
      // Forward the inner loop's result to the next outer iteration.
      scf.yield %2 : tensor<4096x4096xf32>
    }
    return %1 : tensor<4096x4096xf32>
  }
}

# Bufferize (including function boundaries), lower to the LLVM dialect,
# translate to LLVM IR, and compile the IR to an object file.
# clang is given '-' so it reads the IR from stdin; -o names the object file.
mlir-opt /tmp/foo.mlir  --one-shot-bufferize='bufferize-function-boundaries' -test-lower-to-llvm | mlir-translate -mlir-to-llvmir | clang -x ir -c - -o /tmp/foo.o


I still get an error.

Really? As of llvmorg-18-init-9519-g99c15eb49ba0 (Wed Oct 25 09:52:30 2023 +0200), your code from the original post compiles fine. Which version are you using?

Thanks for your reply. I am using an old version, LLVM 16; I will update to the latest LLVM version.