Indeed. 
Sure, will do by the end of the week.
With such a sequence:
// Transform-dialect script: tile linalg.matmul_transpose_a for scalable
// (SME-style) vectors, peel the scalable loops to unmask the main loop,
// then vectorize and lower to outer products.
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%module : !transform.any_op ) {
// Match the transposed-A matmul anywhere in the payload module.
%matmul = transform.structured.match ops{["linalg.matmul_transpose_a"]} in %module
: (!transform.any_op) -> !transform.any_op
// Tile to SVL (vscale = 2)
%tiled_linalg_op_0, %loops_1:3 = transform.structured.tile_using_for %matmul[8, 8, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
// Step 1: Tile for size [4] x [4], which corresponds to SVLs x SVLs, where
// SVLs is the number of 32-bit elements in a vector of SVL bits.
%tiled_linalg_op, %loop_1, %loop_2, %loop_3 = transform.structured.tile_using_for %tiled_linalg_op_0[[4], [4], 1]
: (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.any_op)
// Peel inner loop first then outer loop to remove masks in main loop
%inner_main_loop, %inner_remainder_loop = transform.loop.peel %loop_2 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)
%outer_main_loop, %outer_remainder_loop = transform.loop.peel %loop_1 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)
// Step 2: Vectorize.
// Re-match: tiling/peeling invalidated the original matmul handle, so the
// (now possibly multiple) matmul payload ops must be matched again.
%matmulpeeled = transform.structured.match ops{["linalg.matmul_transpose_a"]} in %module : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %matmulpeeled vector_sizes [[4], [4], 1]
: !transform.any_op
%func = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.any_op
// Step 3: Lower vector.multi_reduction to vector.contract (+ some helpful patterns).
transform.apply_patterns to %func {
// Canonicalization is needed for peeling, but it also destroys the
// contraction structure (we lose contracts and keep only peeled loops).
transform.apply_patterns.canonicalization
transform.apply_patterns.vector.lower_masked_transfers
transform.apply_patterns.vector.transfer_permutation_patterns
transform.apply_patterns.vector.reduction_to_contract
} : !transform.any_op
// Step 4: Lower vector.contract to vector.outerproduct.
transform.apply_patterns to %func {
transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
transform.apply_patterns.vector.lower_masks
} : !transform.any_op
transform.yield
}
}
You have outer tiles of SVL and peeled loops. We lost the contraction because of the canonicalization required by peeling. By the way, canonicalizing before vectorization breaks things because it goes inside the MaskOp again.
// Resulting IR after tiling + peeling + vectorization (before the
// contraction-to-outerproduct lowering could fire).
// #map:  remaining tile size, clamped to 8 (outer tiles).
// #map1: largest multiple of the scalable step below the bound (peel split point).
// #map2: remainder trip count after the split point.
#map = affine_map<(d0)[s0] -> (-d0 + s0, 8)>
#map1 = affine_map<()[s0, s1] -> (s0 - s0 mod s1)>
#map2 = affine_map<(d0)[s0] -> (-d0 + s0)>
module {
func.func @matmul_transpose_a(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) {
%cst = arith.constant 0.000000e+00 : f32
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%dim = tensor.dim %arg0, %c0 : tensor<?x?xf32>
%dim_0 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
%dim_1 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
// 3 outer tiles [8, 8, 1]
%0 = scf.for %arg3 = %c0 to %dim_0 step %c8 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
%1 = scf.for %arg5 = %c0 to %dim_1 step %c8 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
%2 = scf.for %arg7 = %c0 to %dim step %c1 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
%3 = affine.min #map(%arg3)[%dim_0]
%4 = affine.min #map(%arg5)[%dim_1]
%extracted_slice = tensor.extract_slice %arg0[%arg7, %arg3] [1, %3] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_2 = tensor.extract_slice %arg1[%arg7, %arg5] [1, %4] [1, 1] : tensor<?x?xf32> to tensor<1x?xf32>
%extracted_slice_3 = tensor.extract_slice %arg8[%arg3, %arg5] [%3, %4] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// Scalable step: 4 * vscale (duplicated — one per tiled dimension).
%vscale = vector.vscale
%c4_vscale = arith.muli %vscale, %c4 : index
%vscale_4 = vector.vscale
%c4_vscale_5 = arith.muli %vscale_4, %c4 : index
%5 = affine.apply #map1()[%3, %c4_vscale]
// Outer scalable loop
%6 = scf.for %arg9 = %c0 to %5 step %c4_vscale iter_args(%arg10 = %extracted_slice_3) -> (tensor<?x?xf32>) {
%8 = affine.apply #map1()[%4, %c4_vscale_5]
// Main loop
%9 = scf.for %arg11 = %c0 to %8 step %c4_vscale_5 iter_args(%arg12 = %arg10) -> (tensor<?x?xf32>) {
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg9] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_2[0, %arg11] [1, %c4_vscale_5] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_8 = tensor.extract_slice %arg12[%arg9, %arg11] [%c4_vscale, %c4_vscale_5] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
// broadcast + transpose of A and B, elementwise mulf/addf — the
// contraction was canonicalized away before it could be matched.
%11 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x?xf32>, vector<1x[4]xf32>
%12 = vector.broadcast %11 : vector<1x[4]xf32> to vector<[4]x1x[4]xf32>
%13 = vector.transpose %12, [2, 0, 1] : vector<[4]x1x[4]xf32> to vector<[4]x[4]x1xf32>
%14 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x?xf32>, vector<1x[4]xf32>
%15 = vector.broadcast %14 : vector<1x[4]xf32> to vector<[4]x1x[4]xf32>
%16 = vector.transpose %15, [0, 2, 1] : vector<[4]x1x[4]xf32> to vector<[4]x[4]x1xf32>
%17 = vector.transfer_read %extracted_slice_8[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x[4]xf32>
%18 = arith.mulf %13, %16 : vector<[4]x[4]x1xf32>
%19 = vector.constant_mask [4, 4] : vector<[4]x[4]xi1>
%20 = vector.shape_cast %18 : vector<[4]x[4]x1xf32> to vector<[4]x[4]xf32>
%21 = arith.addf %17, %20 : vector<[4]x[4]xf32>
%22 = arith.select %19, %21, %20 : vector<[4]x[4]xi1>, vector<[4]x[4]xf32>
%23 = vector.transfer_write %22, %extracted_slice_8[%c0, %c0] {in_bounds = [true, true]} : vector<[4]x[4]xf32>, tensor<?x?xf32>
%inserted_slice_9 = tensor.insert_slice %23 into %arg12[%arg9, %arg11] [%c4_vscale, %c4_vscale_5] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice_9 : tensor<?x?xf32>
}
// Peeled inner loop: remainder in the j dimension, masked transfers.
%10 = scf.for %arg11 = %8 to %4 step %c4_vscale_5 iter_args(%arg12 = %9) -> (tensor<?x?xf32>) {
%11 = affine.apply #map2(%arg11)[%4]
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg9] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_2[0, %arg11] [1, %11] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_8 = tensor.extract_slice %arg12[%arg9, %arg11] [%c4_vscale, %11] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%12 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<1x?xf32>, vector<1x[4]xf32>
%13 = vector.broadcast %12 : vector<1x[4]xf32> to vector<[4]x1x[4]xf32>
%14 = vector.transpose %13, [2, 0, 1] : vector<[4]x1x[4]xf32> to vector<[4]x[4]x1xf32>
%15 = vector.create_mask %11 : vector<[4]xi1>
%16 = vector.broadcast %15 : vector<[4]xi1> to vector<1x[4]xi1>
%17 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst, %16 {in_bounds = [true, true]} : tensor<1x?xf32>, vector<1x[4]xf32>
%18 = vector.broadcast %17 : vector<1x[4]xf32> to vector<[4]x1x[4]xf32>
%19 = vector.transpose %18, [0, 2, 1] : vector<[4]x1x[4]xf32> to vector<[4]x[4]x1xf32>
%20 = vector.create_mask %c4_vscale, %11 : vector<[4]x[4]xi1>
%21 = vector.transfer_read %extracted_slice_8[%c0, %c0], %cst, %20 {in_bounds = [true, true]} : tensor<?x?xf32>, vector<[4]x[4]xf32>
%22 = arith.mulf %14, %19 : vector<[4]x[4]x1xf32>
%23 = vector.create_mask %c4_vscale, %11 : vector<[4]x[4]xi1>
%24 = vector.shape_cast %22 : vector<[4]x[4]x1xf32> to vector<[4]x[4]xf32>
%25 = arith.addf %21, %24 : vector<[4]x[4]xf32>
%26 = arith.select %23, %25, %24 : vector<[4]x[4]xi1>, vector<[4]x[4]xf32>
%27 = vector.transfer_write %26, %extracted_slice_8[%c0, %c0], %20 {in_bounds = [true, true]} : vector<[4]x[4]xf32>, tensor<?x?xf32>
%inserted_slice_9 = tensor.insert_slice %27 into %arg12[%arg9, %arg11] [%c4_vscale, %11] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice_9 : tensor<?x?xf32>
}
scf.yield %10 : tensor<?x?xf32>
}
// Peeled outer loop: i-dimension remainder. Note the bodies here still
// contain un-vectorized linalg.matmul_transpose_a ops.
%7 = scf.for %arg9 = %5 to %3 step %c4_vscale iter_args(%arg10 = %6) -> (tensor<?x?xf32>) {
%8 = affine.apply #map1()[%4, %c4_vscale_5]
%9 = scf.for %arg11 = %c0 to %8 step %c4_vscale_5 iter_args(%arg12 = %arg10) -> (tensor<?x?xf32>) {
%11 = affine.apply #map2(%arg9)[%3]
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg9] [1, %11] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_2[0, %arg11] [1, %c4_vscale_5] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_8 = tensor.extract_slice %arg12[%arg9, %arg11] [%11, %c4_vscale_5] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%12 = linalg.matmul_transpose_a ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x?xf32>, tensor<1x?xf32>) outs(%extracted_slice_8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%inserted_slice_9 = tensor.insert_slice %12 into %arg12[%arg9, %arg11] [%11, %c4_vscale_5] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice_9 : tensor<?x?xf32>
}
%10 = scf.for %arg11 = %8 to %4 step %c4_vscale_5 iter_args(%arg12 = %9) -> (tensor<?x?xf32>) {
%11 = affine.apply #map2(%arg9)[%3]
%12 = affine.apply #map2(%arg11)[%4]
%extracted_slice_6 = tensor.extract_slice %extracted_slice[0, %arg9] [1, %11] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_7 = tensor.extract_slice %extracted_slice_2[0, %arg11] [1, %12] [1, 1] : tensor<1x?xf32> to tensor<1x?xf32>
%extracted_slice_8 = tensor.extract_slice %arg12[%arg9, %arg11] [%11, %12] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
%13 = linalg.matmul_transpose_a ins(%extracted_slice_6, %extracted_slice_7 : tensor<1x?xf32>, tensor<1x?xf32>) outs(%extracted_slice_8 : tensor<?x?xf32>) -> tensor<?x?xf32>
%inserted_slice_9 = tensor.insert_slice %13 into %arg12[%arg9, %arg11] [%11, %12] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice_9 : tensor<?x?xf32>
}
scf.yield %10 : tensor<?x?xf32>
}
%inserted_slice = tensor.insert_slice %7 into %arg8[%arg3, %arg5] [%3, %4] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
scf.yield %inserted_slice : tensor<?x?xf32>
}
scf.yield %2 : tensor<?x?xf32>
}
scf.yield %1 : tensor<?x?xf32>
}
%cast = tensor.cast %0 : tensor<?x?xf32> to tensor<*xf32>
call @printMemrefF32(%cast) : (tensor<*xf32>) -> ()
return
}
// Empty leftover of the consumed transform script.
module attributes {transform.with_named_sequence} {
}
func.func private @printMemrefF32(tensor<*xf32>)
}
If you replace vscale in place with %vscale = arith.constant 2 : index,
the peeled loops should disappear, as there is no remainder, and the main loop should run only one iteration. I suspect the constant propagation is lost because of the affine map depending on the outer tile position.
I think the goal is to generate an outer product out of those arith.addf + arith.mulf pairs, because we are in 1D and it will be easier than creating matmuls, and we don't want to "lift" as you mentioned. The important part of the pattern should lie in the broadcast beforehand (otherwise it is just an elementwise fmuladd). There are transpositions inserted along the way, which will make it a bit more cumbersome.
If we consider that the matmuls have been transposed to fit the matmul_transpose_a use case, we have two 1D vectors A and B of type vector<[4]xf32>, multiplied and accumulated (arith.addf) on vector<[4]x[4]xf32> into an accumulator C. We want the result to be something like this:
$\forall i,j:\quad C_{i,j} \mathrel{+}= A_i \cdot B_j$
That is broadcasting A “horizontally” (aka broadcast(transpose(A))) (might be more efficient to broadcast the transposed but not sure how hardware can represent that.) and broadcast B “vertically”.
In our case, A and B are 2D vectors with one dim equal to 1. Can we ignore that dimension and call them "1D-like"?
Basically inline matching would be
addf(mulf(transpose(broadcast(A)), broadcast(B)), C) : vector<nxkxfXX>
And take care of commutativity.
If I'm not mistaken, it does not depend on the shapes of any elements, other than that A and B must be "1D-like" and that the broadcasted vectors and C have the same type — which should already be checked by the verifier.