Hi, everyone!
I tried to use the convert-vector-to-gpu
pass, but it didn’t work. Can anyone give me some advice on how to properly generate mma ops?
Here is my input IR and pipeline:
func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) {
%0 = "tosa.matmul"(%arg0, %arg1) : (tensor<1x5x3xf32>, tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>)
return %0 : tensor<1x5x6xf32>
}
mlir-opt input.mlir \
-pass-pipeline="builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg))" \
|mlir-opt -linalg-bufferize -convert-linalg-to-affine-loops \
-affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=2,0" \
|mlir-opt -pass-pipeline="builtin.module(func.func(convert-vector-to-gpu{use-nvgpu=true}))"\
-o output.mlir
Thank you very much in advance!
In my case the problem is in the last step: vector.transfer_write
is successfully generated, but it cannot be further lowered to mma ops by convert-vector-to-gpu.
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, 0)>
module {
func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> tensor<1x5x6xf32> {
%cst = arith.constant dense<0.000000e+00> : vector<32x32xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = bufferization.to_memref %arg1 : memref<1x3x6xf32>
%1 = bufferization.to_memref %arg0 : memref<1x5x3xf32>
%alloc = memref.alloc() {alignment = 64 : i64} : memref<1x5x6xf32>
affine.for %arg2 = 0 to 1 step 32 {
affine.for %arg3 = 0 to 5 {
affine.for %arg4 = 0 to 6 step 32 {
vector.transfer_write %cst, %alloc[%arg2, %arg3, %arg4] {permutation_map = #map} : vector<32x32xf32>, memref<1x5x6xf32>
}
}
}
%alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<1x5x6xf32>
memref.copy %alloc, %alloc_1 : memref<1x5x6xf32> to memref<1x5x6xf32>
affine.for %arg2 = 0 to 1 step 32 {
affine.for %arg3 = 0 to 5 {
affine.for %arg4 = 0 to 6 step 32 {
affine.for %arg5 = 0 to 3 {
%3 = vector.transfer_read %1[%arg2, %arg3, %arg5], %cst_0 {permutation_map = #map1} : memref<1x5x3xf32>, vector<32x32xf32>
%4 = vector.transfer_read %0[%arg2, %arg5, %arg4], %cst_0 {permutation_map = #map} : memref<1x3x6xf32>, vector<32x32xf32>
%5 = vector.transfer_read %alloc_1[%arg2, %arg3, %arg4], %cst_0 {permutation_map = #map} : memref<1x5x6xf32>, vector<32x32xf32>
%6 = arith.mulf %3, %4 : vector<32x32xf32>
%7 = arith.addf %5, %6 : vector<32x32xf32>
vector.transfer_write %7, %alloc_1[%arg2, %arg3, %arg4] {permutation_map = #map} : vector<32x32xf32>, memref<1x5x6xf32>
}
}
}
}
%2 = bufferization.to_tensor %alloc_1 : memref<1x5x6xf32>
return %2 : tensor<1x5x6xf32>
}
}