Hello,
Consider this (trivial) TensorFlow code:
func @myfun(%x:tensor<2xf32>)->(tensor<2xf32>) {
%c = "tf.Const"(){value = dense<(3.000000e+00,4.0)>:tensor<2xf32>} : () -> tensor<2xf32>
%0 = "tf.AddV2"(%c, %x) {device = ""} : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32>
return %0 : tensor<2xf32>
}
Converting it to Linalg + Std using `tf-opt --tf-to-hlo-pipeline --hlo-legalize-to-linalg --mhlo-legalize-to-std` produces the following code:
#map = affine_map<(d0) -> (d0)>
module {
func @myfun(%arg0: tensor<2xf32>) -> tensor<2xf32> {
%cst = constant dense<[3.000000e+00, 4.000000e+00]> : tensor<2xf32>
%0 = linalg.init_tensor [2] : tensor<2xf32>
%1 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%arg0, %cst : tensor<2xf32>, tensor<2xf32>) outs(%0 : tensor<2xf32>) {
^bb0(%arg1: f32, %arg2: f32, %arg3: f32): // no predecessors
%2 = addf %arg1, %arg2 : f32
linalg.yield %2 : f32
} -> tensor<2xf32>
return %1 : tensor<2xf32>
}
}
This code already has its quirks (the %0 tensor being initialized and given as input to linalg.generic), but my true problem is that I cannot lower this to nice low-level code. My best result is the following:
global_memref "private" constant @__constant_2xf32 : memref<2xf32> = dense<[3.000000e+00, 4.000000e+00]>
func @myfun(%arg0: memref<2xf32>, %arg1: memref<2xf32>) {
%0 = alloc() : memref<1xf32>
%1 = get_global_memref @__constant_2xf32 : memref<2xf32>
%c0 = constant 0 : index
%c2 = constant 2 : index
%c1 = constant 1 : index
scf.for %arg2 = %c0 to %c2 step %c1 {
%2 = load %arg0[%arg2] : memref<2xf32>
%3 = load %1[%arg2] : memref<2xf32>
%4 = addf %2, %3 : f32
store %4, %0[%c0] : memref<1xf32>
%5 = load %0[%c0] : memref<1xf32>
store %5, %arg1[%arg2] : memref<2xf32>
}
dealloc %0 : memref<1xf32>
return
}
Note that a buffer is created, then deallocated, and data is copied around needlessly.
My question: Is there a pipeline of transformations that does not allocate memory and copy data around?
Best,
Dumitru