Thank you for the answer. I tried the passes you suggested, but in the final IR I still see the memref, tensor, bufferization, and linalg dialects alongside the LLVM dialect. Is it possible to get pure LLVM dialect IR? And how would I execute this graph? Thanks.
First, I ran ./mlir-opt --pass-pipeline="builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg))" ./graph.mlir
to get:
#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = tensor.empty() : tensor<1x16x2x2xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0, %arg1 : tensor<1x16x2x2xf32>, tensor<1x16x2x2xf32>) outs(%0 : tensor<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %2 = arith.addf %in, %in_0 : f32
      linalg.yield %2 : f32
    } -> tensor<1x16x2x2xf32>
    return %1 : tensor<1x16x2x2xf32>
  }
}
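(For context, graph.mlir is essentially a single elementwise add coming out of a small PyTorch model via the TOSA backend. A minimal input along the lines below reproduces the same lowering; I am sketching it from memory, so the exact exported form may differ.)
module attributes {torch.debug_module_name = "Model"} {
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    // Elementwise add; tosa-to-linalg rewrites this into the linalg.generic shown above.
    %0 = tosa.add %arg0, %arg1 : (tensor<1x16x2x2xf32>, tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32>
    return %0 : tensor<1x16x2x2xf32>
  }
}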
Then, I ran ./mlir-opt --one-shot-bufferize ./graph.mlir
to get:
#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = bufferization.to_memref %arg1 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x16x2x2xf32>
    linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>, memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>) outs(%alloc : memref<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %3 = arith.addf %in, %in_0 : f32
      linalg.yield %3 : f32
    }
    %2 = bufferization.to_tensor %alloc : memref<1x16x2x2xf32>
    return %2 : tensor<1x16x2x2xf32>
  }
}
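(Looking at this output, the function signature is still tensor-based, which is why the bufferization.to_memref / bufferization.to_tensor casts remain at the boundary. My guess is that bufferization also has to be applied across the function boundary, perhaps with the option below, though I am not sure this is the intended way:)
./mlir-opt --one-shot-bufferize="bufferize-function-boundaries" ./graph.mlir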
And lastly, I ran ./mlir-opt --convert-to-llvm ./graph.mlir
to get:
#map = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "Model"} {
  llvm.func @malloc(i64) -> !llvm.ptr
  func.func @forward(%arg0: tensor<1x16x2x2xf32>, %arg1: tensor<1x16x2x2xf32>) -> tensor<1x16x2x2xf32> {
    %0 = bufferization.to_memref %arg1 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %1 = bufferization.to_memref %arg0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>
    %2 = llvm.mlir.constant(1 : index) : i64
    %3 = llvm.mlir.constant(16 : index) : i64
    %4 = llvm.mlir.constant(2 : index) : i64
    %5 = llvm.mlir.constant(2 : index) : i64
    %6 = llvm.mlir.constant(1 : index) : i64
    %7 = llvm.mlir.constant(4 : index) : i64
    %8 = llvm.mlir.constant(64 : index) : i64
    %9 = llvm.mlir.constant(64 : index) : i64
    %10 = llvm.mlir.zero : !llvm.ptr
    %11 = llvm.getelementptr %10[%9] : (!llvm.ptr, i64) -> !llvm.ptr, f32
    %12 = llvm.ptrtoint %11 : !llvm.ptr to i64
    %13 = llvm.mlir.constant(64 : index) : i64
    %14 = llvm.add %12, %13 : i64
    %15 = llvm.call @malloc(%14) : (i64) -> !llvm.ptr
    %16 = llvm.ptrtoint %15 : !llvm.ptr to i64
    %17 = llvm.mlir.constant(1 : index) : i64
    %18 = llvm.sub %13, %17 : i64
    %19 = llvm.add %16, %18 : i64
    %20 = llvm.urem %19, %13 : i64
    %21 = llvm.sub %19, %20 : i64
    %22 = llvm.inttoptr %21 : i64 to !llvm.ptr
    %23 = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %24 = llvm.insertvalue %15, %23[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %25 = llvm.insertvalue %22, %24[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %26 = llvm.mlir.constant(0 : index) : i64
    %27 = llvm.insertvalue %26, %25[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %28 = llvm.insertvalue %2, %27[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %29 = llvm.insertvalue %3, %28[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %30 = llvm.insertvalue %4, %29[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %31 = llvm.insertvalue %5, %30[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %32 = llvm.insertvalue %8, %31[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %33 = llvm.insertvalue %7, %32[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %34 = llvm.insertvalue %5, %33[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %35 = llvm.insertvalue %6, %34[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
    %36 = builtin.unrealized_conversion_cast %35 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> to memref<1x16x2x2xf32>
    linalg.generic {indexing_maps = [#map, #map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%1, %0 : memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>, memref<1x16x2x2xf32, strided<[?, ?, ?, ?], offset: ?>>) outs(%36 : memref<1x16x2x2xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %38 = llvm.fadd %in, %in_0 : f32
      linalg.yield %38 : f32
    }
    %37 = bufferization.to_tensor %36 : memref<1x16x2x2xf32>
    return %37 : tensor<1x16x2x2xf32>
  }
}
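So my current guess is that, starting from the linalg-on-tensors form produced by the first step, I still need to bufferize across function boundaries and lower the remaining linalg.generic to loops before the LLVM conversion, roughly along these lines (just my guess at what the pipeline should look like):
./mlir-opt ./graph.mlir \
  --one-shot-bufferize="bufferize-function-boundaries" \
  --convert-linalg-to-loops \
  --convert-scf-to-cf \
  --convert-to-llvm \
  --reconcile-unrealized-casts
And for execution, I assume the result can then be translated with ./mlir-translate --mlir-to-llvmir and compiled, or run with mlir-cpu-runner, but that is the part I am least sure about. Is that the right direction?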