The GPU dialect says:
GPU functions are either kernels (as indicated by the
kernel
attribute) or regular functions. The former can be launched from the host side, while the latter are device side only.
However, it does not show examples of calling non-kernel gpu.func
and I fail to run the following:
module attributes {gpu.container_module} {
gpu.module @kernels {
gpu.func @simple1(
%r_size : index,
%in_a : memref<?xi64>,
%in_b : memref<?xi64>,
%in_c : memref<?xi64>
) {
%ci0 = arith.constant 0 : index
%ci1 = arith.constant 1 : index
scf.for %idx0 = %ci0 to %r_size step %ci1 {
%idx0_i64 = arith.index_cast %idx0 : index to i64
memref.store %idx0_i64, %in_a[%idx0] : memref<?xi64>
}
gpu.return
}
gpu.func @simple(
%r_size : index,
%in_a : memref<?xi64>,
%in_b : memref<?xi64>,
%in_c : memref<?xi64>
) kernel {
call @simple1(%r_size, %in_a, %in_b, %in_c)
gpu.return
}
}
func.func @main() -> i64 {
// Constants
%ci1 = arith.constant 1 : index
%c0 = arith.constant 0 : i64
%size = arith.constant 100 : index
%a = memref.alloc(%size) : memref<?xi64>
%b = memref.alloc(%size) : memref<?xi64>
%c = memref.alloc(%size) : memref<?xi64>
%a_unranked = memref.cast %a : memref<?xi64> to memref<*xi64>
%b_unranked = memref.cast %b : memref<?xi64> to memref<*xi64>
%c_unranked = memref.cast %c : memref<?xi64> to memref<*xi64>
gpu.host_register %a_unranked : memref<*xi64>
gpu.host_register %b_unranked : memref<*xi64>
gpu.host_register %c_unranked : memref<*xi64>
%tmp_a = gpu.alloc(%size) : memref<?xi64>
%tmp_b = gpu.alloc(%size) : memref<?xi64>
%tmp_c = gpu.alloc(%size) : memref<?xi64>
%token_a = gpu.memcpy async %tmp_a, %a : memref<?xi64>, memref<?xi64>
%token_b = gpu.memcpy async [%token_a] %tmp_b, %b : memref<?xi64>, memref<?xi64>
%token_c = gpu.memcpy async [%token_b] %tmp_c, %c : memref<?xi64>, memref<?xi64>
%token_d = gpu.launch_func async [%token_c] @kernels::@simple blocks in (%ci1, %ci1, %ci1) threads in (%ci1, %ci1, %ci1)
args(%size : index, %tmp_a : memref<?xi64>,%tmp_b : memref<?xi64>, %tmp_c : memref<?xi64>)
%token_e = gpu.memcpy async %a, %tmp_a : memref<?xi64>, memref<?xi64>
call @printMemrefI64(%a_unranked) : (memref<*xi64>) -> ()
return %c0 : i64
}
func.func private @printI64(i64)
func.func private @printMemrefI64(memref<*xi64>)
} // END gpu.container_module
Hence the question, what is the correct way to call a “device function” in gpu
dialect from within a kernel?