I am using the MatMul example to learn the GPU lowering pipeline. For a 1024x1024 MatMul, I tiled it first into 128x128 tiles and then into 16x16 tiles, which were mapped to blocks and threads respectively, and I ended up with the following MLIR code:
// Tile-origin maps: #map scales a block id by the 128-element block tile,
// #map1 scales a thread id by the 16-element thread (WMMA) tile.
#map = affine_map<(d0) -> (d0 * 128)>
#map1 = affine_map<(d0) -> (d0 * 16)>
module attributes {gpu.container_module} {
// Host-side entry point: launches an 8x8 grid of blocks with 8x8 threads each
// (8 blocks * 128 = 1024; 8 threads * 16 = 128 per block tile) and passes the
// k-loop bounds (lb=0, ub=1024, step=16) to the kernel as runtime arguments.
func.func @matmul(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> {
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
gpu.launch_func @matmul_kernel::@matmul_kernel blocks in (%c8, %c8, %c1) threads in (%c8, %c8, %c1) args(%arg0 : memref<1024x1024xf32>, %arg1 : memref<1024x1024xf32>, %arg2 : memref<1024x1024xf32>, %c0 : index, %c1024 : index, %c16 : index)
return %arg2 : memref<1024x1024xf32>
}
gpu.module @matmul_kernel {
// Device kernel: %arg0 = A, %arg1 = B, %arg2 = C (all 1024x1024 f32);
// %arg3/%arg4/%arg5 = k-loop lower bound / upper bound / step (0 / 1024 / 16).
gpu.func @matmul_kernel(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>, %arg3: index, %arg4: index, %arg5: index) kernel attributes {gpu.known_block_size = array<i32: 8, 8, 1>, gpu.known_grid_size = array<i32: 8, 8, 1>} {
cf.br ^bb1
^bb1: // pred: ^bb0
// Block-level tiling: each block owns a 128x128 tile of C, the matching
// 128-row strip of A and 128-column strip of B.
%0 = gpu.block_id x
%1 = gpu.block_id y
%2 = affine.apply #map(%0)
%3 = affine.apply #map(%1)
%subview = memref.subview %arg0[%2, 0] [128, 1024] [1, 1] : memref<1024x1024xf32> to memref<128x1024xf32, strided<[1024, 1], offset: ?>>
%subview_0 = memref.subview %arg1[0, %3] [1024, 128] [1, 1] : memref<1024x1024xf32> to memref<1024x128xf32, strided<[1024, 1], offset: ?>>
%subview_1 = memref.subview %arg2[%2, %3] [128, 128] [1, 1] : memref<1024x1024xf32> to memref<128x128xf32, strided<[1024, 1], offset: ?>>
// Thread-level tiling: each thread owns a 16x16 WMMA tile inside the
// block tile, addressed by (thread_id x, thread_id y) * 16.
%4 = gpu.thread_id x
%5 = gpu.thread_id y
%6 = affine.apply #map1(%4)
%7 = affine.apply #map1(%5)
%subview_2 = memref.subview %subview[%6, 0] [16, 1024] [1, 1] : memref<128x1024xf32, strided<[1024, 1], offset: ?>> to memref<16x1024xf32, strided<[1024, 1], offset: ?>>
%subview_3 = memref.subview %subview_0[0, %7] [1024, 16] [1, 1] : memref<1024x128xf32, strided<[1024, 1], offset: ?>> to memref<1024x16xf32, strided<[1024, 1], offset: ?>>
%subview_4 = memref.subview %subview_1[%6, %7] [16, 16] [1, 1] : memref<128x128xf32, strided<[1024, 1], offset: ?>> to memref<16x16xf32, strided<[1024, 1], offset: ?>>
// Load the C accumulator tile once; leadDimension = 1024 matches the
// row stride of the strided<[1024, 1]> subview layout.
%8 = gpu.subgroup_mma_load_matrix %subview_4[%arg3, %arg3] {leadDimension = 1024 : index} : memref<16x16xf32, strided<[1024, 1], offset: ?>> -> !gpu.mma_matrix<16x16xf32, "COp">
// k-loop (0 to 1024 step 16): load one 16x16 A tile and one 16x16 B tile
// per iteration and accumulate via WMMA. This scf.for is the op the
// -gpu-lower-to-nvvm pipeline reports as illegal in the question below.
%9 = scf.for %arg6 = %arg3 to %arg4 step %arg5 iter_args(%arg7 = %8) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
%subview_5 = memref.subview %subview_2[0, %arg6] [16, 16] [1, 1] : memref<16x1024xf32, strided<[1024, 1], offset: ?>> to memref<16x16xf32, strided<[1024, 1], offset: ?>>
%subview_6 = memref.subview %subview_3[%arg6, 0] [16, 16] [1, 1] : memref<1024x16xf32, strided<[1024, 1], offset: ?>> to memref<16x16xf32, strided<[1024, 1], offset: ?>>
%10 = gpu.subgroup_mma_load_matrix %subview_5[%arg3, %arg3] {leadDimension = 1024 : index} : memref<16x16xf32, strided<[1024, 1], offset: ?>> -> !gpu.mma_matrix<16x16xf32, "AOp">
%11 = gpu.subgroup_mma_load_matrix %subview_6[%arg3, %arg3] {leadDimension = 1024 : index} : memref<16x16xf32, strided<[1024, 1], offset: ?>> -> !gpu.mma_matrix<16x16xf32, "BOp">
%12 = gpu.subgroup_mma_compute %10, %11, %arg7 : !gpu.mma_matrix<16x16xf32, "AOp">, !gpu.mma_matrix<16x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %12 : !gpu.mma_matrix<16x16xf32, "COp">
}
// Write the final accumulator back to C.
gpu.subgroup_mma_store_matrix %9, %subview_4[%arg3, %arg3] {leadDimension = 1024 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<16x16xf32, strided<[1024, 1], offset: ?>>
// NOTE(review): the two copies below have identical source and destination
// operands (self-copies) — presumably dead leftovers from bufferization;
// verify they can be removed by a canonicalization/CSE pass.
memref.copy %subview_4, %subview_4 : memref<16x16xf32, strided<[1024, 1], offset: ?>> to memref<16x16xf32, strided<[1024, 1], offset: ?>>
gpu.barrier
memref.copy %subview_1, %subview_1 : memref<128x128xf32, strided<[1024, 1], offset: ?>> to memref<128x128xf32, strided<[1024, 1], offset: ?>>
gpu.return
}
}
}
When I lower the above code with the command `mlir-opt -gpu-lower-to-nvvm="cubin-chip=sm_75 cubin-features=+ptx75 opt-level=3"`, I get the following error:
error: failed to legalize operation 'scf.for' that was explicitly marked illegal
%9 = scf.for %arg6 = %arg3 to %arg4 step %arg5 iter_args(%arg7 = %8) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
What is causing this error, and what is the correct GPU dialect lowering process?