Question: convert vector to mma ops

Hi, everyone!

I tried to use convert-vector-to-gpu pass, but it didn’t work. Can anyone give me some advice on how to properly generate mma ops?

Here is my input IR and pipeline:

func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) {
  %0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>)  -> (tensor<1x5x6xf32>)
  return %0 : tensor<1x5x6xf32>
}
mlir-opt input.mlir \
-pass-pipeline="builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg))"  \
|mlir-opt  -linalg-bufferize -convert-linalg-to-affine-loops \
-affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=2,0" \
|mlir-opt  -pass-pipeline="builtin.module(func.func(convert-vector-to-gpu{use-nvgpu=true}))"\
-o  output.mlir

Thank you very much in advance!

In my case the problem is in the last step: vector.transfer_write is successfully generated, but cannot be further lowered to mma ops by convert-vector-to-gpu.

#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, 0)>
module {
  func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> tensor<1x5x6xf32> {
    %cst = arith.constant dense<0.000000e+00> : vector<32x32xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %0 = bufferization.to_memref %arg1 : memref<1x3x6xf32>
    %1 = bufferization.to_memref %arg0 : memref<1x5x3xf32>
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<1x5x6xf32>
    affine.for %arg2 = 0 to 1 step 32 {
      affine.for %arg3 = 0 to 5 {
        affine.for %arg4 = 0 to 6 step 32 {
          vector.transfer_write %cst, %alloc[%arg2, %arg3, %arg4] {permutation_map = #map} : vector<32x32xf32>, memref<1x5x6xf32>
        }
      }
    }
    %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x5x6xf32>
    memref.copy %alloc, %alloc_1 : memref<1x5x6xf32> to memref<1x5x6xf32>
    affine.for %arg2 = 0 to 1 step 32 {
      affine.for %arg3 = 0 to 5 {
        affine.for %arg4 = 0 to 6 step 32 {
          affine.for %arg5 = 0 to 3 {
            %3 = vector.transfer_read %1[%arg2, %arg3, %arg5], %cst_0 {permutation_map = #map1} : memref<1x5x3xf32>, vector<32x32xf32>
            %4 = vector.transfer_read %0[%arg2, %arg5, %arg4], %cst_0 {permutation_map = #map} : memref<1x3x6xf32>, vector<32x32xf32>
            %5 = vector.transfer_read %alloc_1[%arg2, %arg3, %arg4], %cst_0 {permutation_map = #map} : memref<1x5x6xf32>, vector<32x32xf32>
            %6 = arith.mulf %3, %4 : vector<32x32xf32>
            %7 = arith.addf %5, %6 : vector<32x32xf32>
            vector.transfer_write %7, %alloc_1[%arg2, %arg3, %arg4] {permutation_map = #map} : vector<32x32xf32>, memref<1x5x6xf32>
          }
        }
      }
    }
    %2 = bufferization.to_tensor %alloc_1 : memref<1x5x6xf32>
    return %2 : tensor<1x5x6xf32>
  }
}