Here is my code.
func.func @main() {
%cst0 = arith.constant 0 : index
%cst1 = arith.constant 1 : index
%cst2 = arith.constant 2 : index
%cst3 = arith.constant 3 : index
%cst10 = arith.constant 10 : index
%cst32 = arith.constant 32 : index
%cst16 = arith.constant 16 : index
%f0 = arith.constant 0.0 : f16
%f1 = arith.constant 1.0 : f16
%f2 = arith.constant 2.0 : f16
%f2f32 = arith.constant 2.0 : f32
%input0 = memref.alloc() : memref<3x3xf16>
%input1 = memref.alloc() : memref<3x3xf16>
%output0 = memref.alloc() : memref<3x3xf32>
%input_cast0 = memref.cast %input0 : memref<3x3xf16> to memref<*xf16>
%input_cast1 = memref.cast %input1 : memref<3x3xf16> to memref<*xf16>
%output_cast0 = memref.cast %output0 : memref<3x3xf32> to memref<*xf32>
scf.for %i = %cst0 to %cst3 step %cst1 {
scf.for %j = %cst0 to %cst3 step %cst1 {
memref.store %f2, %input0[%i, %j] : memref<3x3xf16>
}
}
scf.for %i = %cst0 to %cst3 step %cst1 {
scf.for %j = %cst0 to %cst3 step %cst1 {
memref.store %f2, %input1[%i, %j] : memref<3x3xf16>
}
}
scf.for %i = %cst0 to %cst3 step %cst1 {
scf.for %j = %cst0 to %cst3 step %cst1 {
memref.store %f2f32, %output0[%i, %j] : memref<3x3xf32>
}
}
call @printMemrefF32(%output_cast0) : (memref<*xf32>) -> ()
gpu.host_register %input_cast0 : memref<*xf16>
gpu.host_register %input_cast1 : memref<*xf16>
gpu.host_register %output_cast0 : memref<*xf32>
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %cst1, %grid_y = %cst1, %grid_z = %cst1)
threads(%tx, %ty, %tz) in (%block_x = %cst32, %block_y = %cst1, %block_z = %cst1) {
%A = gpu.subgroup_mma_load_matrix %input0[%cst0, %cst0] {leadDimension = 3 : index} : memref<3x3xf16> -> !gpu.mma_matrix<16x16xf16, "AOp">
%B = gpu.subgroup_mma_load_matrix %input1[%cst0, %cst0] {leadDimension = 3 : index} : memref<3x3xf16> -> !gpu.mma_matrix<16x16xf16, "BOp">
%C = gpu.subgroup_mma_load_matrix %output0[%cst0, %cst0] {leadDimension = 3 : index} : memref<3x3xf32> -> !gpu.mma_matrix<16x16xf32, "COp">
%D = gpu.subgroup_mma_compute %A, %B, %C : !gpu.mma_matrix<16x16xf16,"AOp">, !gpu.mma_matrix<16x16xf16, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.subgroup_mma_store_matrix %D, %output0[%cst0, %cst0] {leadDimension = 3 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<3x3xf32>
gpu.terminator
}
call @printMemrefF32(%output_cast0) : (memref<*xf32>) -> ()
memref.dealloc %input0 : memref<3x3xf16>
memref.dealloc %input1 : memref<3x3xf16>
memref.dealloc %output0 : memref<3x3xf32>
return
}
func.func private @printMemrefF32(%ptr : memref<*xf32>)
When I run the code use mlir-cpu-runner,show that
Unranked Memref base@ = 0x5627f120d310 rank = 2 offset = 0 sizes = [3, 3] strides = [3, 1] data =
[[2, 2, 2],
[2, 2, 2],
[2, 2, 2]]
Unranked Memref base@ = 0x5627f120d310 rank = 2 offset = 0 sizes = [3, 3] strides = [3, 1] data =
[[22.0599, -20670.1, 214.967],
[-20674, 211.054, -41366.1],
[403.841, -41258, 364.34]]
free(): invalid next size (fast)
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Stack dump:
0. Program arguments: ../../llvm/build/bin/mlir-cpu-runner -entry-point-result=void -shared-libs=../../llvm/build/lib/libmlir_runner_utils.so -shared-libs=../../llvm/build/lib/libmlir_cuda_runtime.so -shared-libs=../../llvm/build/lib/libmlir_async_runtime.so
#0 0x00005627ed567154 PrintStackTraceSignalHandler(void*) Signals.cpp:0:0
#1 0x00005627ed56433b SignalHandler(int) Signals.cpp:0:0
#2 0x00007f6f54769a00 (/usr/lib/libc.so.6+0x38a00)
#3 0x00007f6f547b949c (/usr/lib/libc.so.6+0x8849c)
#4 0x00007f6f54769958 raise (/usr/lib/libc.so.6+0x38958)
#5 0x00007f6f5475353d abort (/usr/lib/libc.so.6+0x2253d)
#6 0x00007f6f547ad63e (/usr/lib/libc.so.6+0x7c63e)
#7 0x00007f6f547c322c (/usr/lib/libc.so.6+0x9222c)
#8 0x00007f6f547c515a (/usr/lib/libc.so.6+0x9415a)
#9 0x00007f6f547c79f3 cfree (/usr/lib/libc.so.6+0x969f3)
#10 0x00007f6f5629743e
#11 0x00007f6f5629745d
#12 0x00005627eda6fa5c compileAndExecute((anonymous namespace)::Options&, mlir::ModuleOp, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig, void**) JitRunner.cpp:0:0
#13 0x00005627eda6ffd1 compileAndExecuteVoidFunction((anonymous namespace)::Options&, mlir::ModuleOp, llvm::StringRef, (anonymous namespace)::CompileAndExecuteConfig) JitRunner.cpp:0:0
#14 0x00005627eda7401b mlir::JitRunnerMain(int, char**, mlir::DialectRegistry const&, mlir::JitRunnerConfig) (../../llvm/build/bin/mlir-cpu-runner+0x90901b)
#15 0x00005627ed4d40fc main (../../llvm/build/bin/mlir-cpu-runner+0x3690fc)
#16 0x00007f6f54754290 (/usr/lib/libc.so.6+0x23290)
#17 0x00007f6f5475434a __libc_start_main (/usr/lib/libc.so.6+0x2334a)
#18 0x00005627ed5504f5 _start /build/glibc/src/glibc/csu/../sysdeps/x86_64/start.S:117:0
make: *** [makefile:62ļ¼gpu-mma-run] é误 134
I donāt know what the problem is.I have some question, i hope someone can help me,thanks!
- gpu.mma_matrix
I guess the dimension of gpu.mma_matrix can only be 16x16,because I tried all the others failed, but Iām not so sure. - the attribute leadDimension
Although I have read the official mlir documentation, I still have trouble understanding the meaning of leadDimension. - the mma operation should be nested in gpu.launch?
llvm-project/mlir/test/Integration/GPU/CUDA/TensorCore at main Ā· llvm/llvm-project Ā· GitHub, i test using mma operation operation but donāt use gpu.launch, but i canāt lower mma operation,but I still want to ask this question.
I really want to know how to use mma operations properly,thanks!