TOSA to LLVM MLIR

Hi all,

I have a tosa.mlir file as follows, and I want to lower it down to LLVM IR only. Which mlir-opt passes should be used? Thanks

module {
  func.func @main(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> {
    %0 = "tosa.const"() <{value = dense_resource<torch_tensor_1024_1024_torch.float32> : tensor<1024x1024xf32>}> : () -> tensor<1024x1024xf32>
    %1 = tosa.reshape %arg0 {new_shape = array<i64: 1, 1024, 1024>} : (tensor<1024x1024xf32>) -> tensor<1x1024x1024xf32>
    %2 = tosa.reshape %0 {new_shape = array<i64: 1, 1024, 1024>} : (tensor<1024x1024xf32>) -> tensor<1x1024x1024xf32>
    %3 = tosa.matmul %1, %2 : (tensor<1x1024x1024xf32>, tensor<1x1024x1024xf32>) -> tensor<1x1024x1024xf32>
    %4 = tosa.reshape %3 {new_shape = array<i64: 1024, 1024>} : (tensor<1x1024x1024xf32>) -> tensor<1024x1024xf32>
    return %4 : tensor<1024x1024xf32>
  }
}

Hello,

I’ve got a lowering path on my side that I can refer you to. You can use mlir-opt to lower TOSA to Linalg; then you may need to run the --one-shot-bufferize pass. After bufferization, you can lower Linalg to the LLVM dialect. Finally, you can translate the LLVM dialect of MLIR to LLVM IR. One of my previous articles (https://zhuanlan.zhihu.com/p/703073038), in Chinese, may help you.

Continue to ask any questions below!

sheen

1 Like

Another useful link that might help you.

sheen

Thank you for the answer. I tried the following passes as suggested by you. In the final IR I still see ops from the memref, tensor, etc. dialects as well. So, is it possible to get pure LLVM dialect IR? And how do I execute this graph? Thanks.

First, I ran ./mlir-opt --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" ./graph.mlir to get:

#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = tensor.empty() : tensor<1x16x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x16x2x2xf32>, tensor<1x16x2x2xf32>) outs(%0 : tensor<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %2 = arith.addf %in, %in_0 : f32
      linalg.yield %2 : f32
    } -> tensor<1x16x2x2xf32>
    return %1 : tensor<1x16x2x2xf32>
  }
}

Then, ./mlir-opt --one-shot-bufferize ./graph.mlir to get:

#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = bufferization.to_memref %arg1 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x16x2x2xf32>
    linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>, memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>) outs(%alloc : memref<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %3 = arith.addf %in, %in_0 : f32
      linalg.yield %3 : f32
    }
    %2 = bufferization.to_tensor %alloc : memref<1x16x2x2xf32>
    return %2 : tensor<1x16x2x2xf32>
  }
}

And lastly, ./mlir-opt --convert-to-llvm ./graph.mlir to get:

#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  llvm.func @malloc(i64) -> !llvm.ptr
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = bufferization.to_memref %arg1 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %2 = llvm.mlir.constant(1 : index) : i64
    %3 = llvm.mlir.constant(16 : index) : i64
    %4 = llvm.mlir.constant(2 : index) : i64
    %5 = llvm.mlir.constant(2 : index) : i64
    %6 = llvm.mlir.constant(1 : index) : i64
    %7 = llvm.mlir.constant(4 : index) : i64
    %8 = llvm.mlir.constant(64 : index) : i64
    %9 = llvm.mlir.constant(64 : index) : i64
    %10 = llvm.mlir.zero : !llvm.ptr
    %11 = llvm.getelementptr %10[%9] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %12 = llvm.ptrtoint %11 : !llvm.ptr to i64
    %13 = llvm.mlir.constant(64 : index) : i64
    %14 = llvm.add %12, %13 : i64
    %15 = llvm.call @malloc(%14) : (i64) -> !llvm.ptr
    %16 = llvm.ptrtoint %15 : !llvm.ptr to i64
    %17 = llvm.mlir.constant(1 : index) : i64
    %18 = llvm.sub %13, %17 : i64
    %19 = llvm.add %16, %18 : i64
    %20 = llvm.urem %19, %13  : i64
    %21 = llvm.sub %19, %20 : i64
    %22 = llvm.inttoptr %21 : i64 to !llvm.ptr
    %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %24 = llvm.insertvalue %15, %23[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %25 = llvm.insertvalue %22, %24[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %26 = llvm.mlir.constant(0 : index) : i64
    %27 = llvm.insertvalue %26, %25[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %28 = llvm.insertvalue %2, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %29 = llvm.insertvalue %3, %28[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %30 = llvm.insertvalue %4, %29[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %31 = llvm.insertvalue %5, %30[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %32 = llvm.insertvalue %8, %31[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %33 = llvm.insertvalue %7, %32[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %34 = llvm.insertvalue %5, %33[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %35 = llvm.insertvalue %6, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %36 = builtin.unrealized_conversion_cast %35 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x16x2x2xf32>
    linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>, memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>) outs(%36 : memref<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %38 = llvm.fadd %in, %in_0  : f32
      linalg.yield %38 : f32
    }
    %37 = bufferization.to_tensor %36 : memref<1x16x2x2xf32>
    return %37 : tensor<1x16x2x2xf32>
  }
}

When running these passes you may need to pass additional pass options to make the conversion complete, such as including the function arguments in the conversion (lowering), etc.

In more detail, the --one-shot-bufferize="bufferize-function-boundaries" option can help you solve your problem.

1 Like