Thank you for your response!

I referred to some of the examples in the test directory and attempted to execute a test containing a matmul. However, I am currently encountering the following error, and I am unsure where the issue lies:
```
'cuStreamSynchronize(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuStreamDestroy(stream)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
'cuModuleUnload(module)' failed with 'CUDA_ERROR_ILLEGAL_ADDRESS'
```
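(From what I understand, once a kernel triggers `CUDA_ERROR_ILLEGAL_ADDRESS` the CUDA context is left in an error state, so the subsequent `cuStreamDestroy`/`cuModuleUnload` failures are likely just fallout from the first one.)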
The IR I tested is as follows:
```mlir
#map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>
#map1 = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>
module attributes {gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.target_triple = "x86_64-unknown-linux-gnu", "onnx-mlir.symbol-postfix" = "matmul_model"} {
  memref.global "private" constant @__constant_4x5xf32_0 : memref<4x5xf32> = dense<[
    [1.0, 2.0, 3.0, 4.0, 5.0],
    [6.0, 7.0, 8.0, 9.0, 10.0],
    [11.0, 12.0, 13.0, 14.0, 15.0],
    [16.0, 17.0, 18.0, 19.0, 20.0]
  ]>
  memref.global "private" constant @__constant_5x3xf32_0 : memref<5x3xf32> = dense<[
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
    [7.0, 8.0, 9.0],
    [10.0, 11.0, 12.0],
    [13.0, 14.0, 15.0]
  ]>
  func.func @main() {
    %0 = memref.get_global @__constant_4x5xf32_0 : memref<4x5xf32>
    %1 = memref.get_global @__constant_5x3xf32_0 : memref<5x3xf32>
    %2 = call @main_graph(%0, %1) : (memref<4x5xf32>, memref<5x3xf32>) -> memref<4x3xf32>
    %cast = memref.cast %2 : memref<4x3xf32> to memref<?x?xf32>
    %cast1 = memref.cast %cast : memref<?x?xf32> to memref<*xf32>
    call @printMemrefF32(%cast1) : (memref<*xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>)
  func.func @main_graph(%arg0: memref<4x5xf32> {onnx.name = "x"}, %arg1: memref<5x3xf32> {onnx.name = "y"}) -> (memref<4x3xf32> {onnx.name = "output"}) attributes {llvm.emit_c_interface} {
    %alloc = memref.alloc() {alignment = 16 : i64} : memref<4x3xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %c1 = arith.constant 1 : index
    %0 = affine.apply #map(%c4)[%c0, %c1]
    %c3 = arith.constant 3 : index
    %1 = affine.apply #map(%c3)[%c0, %c1]
    gpu.launch_func @main_graph_kernel::@main_graph_kernel blocks in (%0, %c1, %c1) threads in (%1, %c1, %c1) args(%c1 : index, %c0 : index, %arg0 : memref<4x5xf32>, %arg1 : memref<5x3xf32>, %cst : f32, %alloc : memref<4x3xf32>)
    return %alloc : memref<4x3xf32>
  }
  gpu.module @main_graph_kernel {
    gpu.func @main_graph_kernel(%arg0: index, %arg1: index, %arg2: memref<4x5xf32>, %arg3: memref<5x3xf32>, %arg4: f32, %arg5: memref<4x3xf32>) kernel {
      %block_id_x = gpu.block_id x
      %thread_id_x = gpu.thread_id x
      %0 = affine.apply #map1(%block_id_x)[%arg0, %arg1]
      %1 = affine.apply #map1(%thread_id_x)[%arg0, %arg1]
      %c5 = arith.constant 5 : index
      %2 = scf.for %arg6 = %arg1 to %c5 step %arg0 iter_args(%arg7 = %arg4) -> (f32) {
        %3 = memref.load %arg2[%0, %arg6] : memref<4x5xf32>
        %4 = memref.load %arg3[%arg6, %1] : memref<5x3xf32>
        %5 = arith.mulf %3, %4 : f32
        %6 = arith.addf %arg7, %5 : f32
        scf.yield %6 : f32
      }
      memref.store %2, %arg5[%0, %1] : memref<4x3xf32>
      gpu.return
    }
  }
}
```
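If I am reading the affine maps correctly, the launch dimensions work out to `#map(%c4)[%c0, %c1] = (4 - 0) ceildiv 1 = 4` blocks and `#map(%c3)[%c0, %c1] = (3 - 0) ceildiv 1 = 3` threads, so each (block, thread) pair computes exactly one element of the 4x3 output and none of the indices should go out of bounds.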
The `main` function and the two constant tensors were added by me manually; the rest was generated automatically. Could this error be caused by missing passes in the lowering pipeline?
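
One other thing I am unsure about: the inputs and the output buffer are plain host allocations (`memref.get_global` / `memref.alloc`), and I do not see any copy or registration before `gpu.launch_func`, which seems like it could explain an illegal address on the device. Do the buffers need to be made device-accessible first, e.g. with `gpu.host_register`? A minimal sketch of what I mean, inserted in `@main` before calling `@main_graph` (I am not sure this is the intended mechanism here):

```mlir
// Register the host buffer with the runtime so the kernel can
// dereference it (sketch; %1 and the result buffer would need
// the same treatment).
%u0 = memref.cast %0 : memref<4x5xf32> to memref<*xf32>
gpu.host_register %u0 : memref<*xf32>
```

Or is the intended route instead to allocate device memory with `gpu.alloc` and copy the data over explicitly with `gpu.memcpy`?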
Thank you again for your help! 