The MLIR and LLVM toolchain was built with [AMDGPU][True16] Don't use the VGPR_LO/HI16 register classes. (#76440) · llvm/llvm-project@8c6172b · GitHub. Use mlir-translate --mlir-to-llvmir| clang++ -c -x ir -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1103
to generate static lib
module attributes {llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.target = #omp.target<target_cpu = "gfx1103", target_features = "">} {
llvm.func @llvm_omp_target_alloc_device(i64, i32) -> !llvm.ptr
llvm.func @omp_get_default_device() -> i32
llvm.func @_QQmain_omp_outline_1() attributes {omp.declare_target = #omp.declaretarget<device_type = (nohost)>} {
%0 = llvm.mlir.zero : !llvm.ptr
%1 = llvm.call @omp_get_default_device() : () -> i32
%2 = llvm.getelementptr %0[67108864] : (!llvm.ptr) -> !llvm.ptr, f64
%3 = llvm.ptrtoint %2 : !llvm.ptr to i64
%4 = llvm.call @llvm_omp_target_alloc_device(%3, %1) : (i64, i32) -> !llvm.ptr
%5 = llvm.call @llvm_omp_target_alloc_device(%3, %1) : (i64, i32) -> !llvm.ptr
%6 = omp.map_info var_ptr(%4 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByCopy) -> !llvm.ptr
%7 = omp.map_info var_ptr(%5 : !llvm.ptr, f64) map_clauses(tofrom) capture(ByCopy) -> !llvm.ptr
omp.target map_entries(%6 -> %arg0, %7 -> %arg1 : !llvm.ptr, !llvm.ptr) {
^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
%8 = llvm.mlir.constant(0 : index) : i64
%9 = llvm.mlir.constant(1 : index) : i64
%10 = llvm.mlir.constant(8192 : index) : i64
omp.teams {
omp.parallel {
omp.wsloop for (%arg2, %arg3, %arg4, %arg5) : i64 = (%8, %8, %8, %8) to (%10, %10, %10, %10) step (%9, %9, %9, %9) {
%11 = llvm.mul %arg2, %10 : i64
%12 = llvm.add %11, %arg3 : i64
%13 = llvm.load %arg0 {alignment = 8 : i64} : !llvm.ptr -> vector<16xf64>
%14 = llvm.load %arg1 {alignment = 8 : i64} : !llvm.ptr -> vector<16xf64>
%15 = llvm.mul %arg3, %10 : i64
%16 = llvm.add %15, %arg2 : i64
%17 = llvm.getelementptr %arg1[%16] : (!llvm.ptr, i64) -> !llvm.ptr, f64
%18 = llvm.load %17 {alignment = 8 : i64} : !llvm.ptr -> vector<16xf64>
%19 = llvm.fmul %13, %14 : vector<16xf64>
%20 = llvm.fdiv %14, %18 : vector<16xf64>
%21 = llvm.fadd %19, %20 : vector<16xf64>
%22 = llvm.getelementptr %arg1[%12] : (!llvm.ptr, i64) -> !llvm.ptr, f64
llvm.store %21, %22 {alignment = 8 : i64} : vector<16xf64>, !llvm.ptr
omp.terminator
}
omp.terminator
}
omp.terminator
}
omp.terminator
}
omp.barrier
llvm.return
}
llvm.func @_mlir_ciface__QQmain_omp_outline_1() attributes {llvm.emit_c_interface} {
llvm.call @_QQmain_omp_outline_1() : () -> ()
llvm.return
}
}
Using C program to call the function and build with clang args -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1103
it’s normal when running on the CPU host, but on AMDGPU it raise errors even if replace llvm_omp_target_alloc_device
to llvm.alloc
and with a smaller size:
Libomptarget error: Host ptr 0x0000560ed81500a1 does not have a matching target pointer.
Libomptarget error: Consult https://openmp.llvm.org/design/Runtimes.html for debugging options.
Libomptarget error: Source location information not present. Compile with -g or -gline-tables-only.
Libomptarget fatal error 1: failure of target construct while offloading is mandatory
The C code has similar function works normally on the AMDGPU:
int main() {
double* a = (double*)llvm_omp_target_alloc_device(4096*4096*sizeof(double) , omp_get_default_device());
double* b = (double*)llvm_omp_target_alloc_device(4096*4096*sizeof(double) , omp_get_default_device());
#pragma omp target teams map(tofrom: a, b)
#pragma omp parallel for
for(int i=0; i<4096; i++){
for(int j=0; j<4096; j++){
for (int k=0; k<4096; k++){
a[i*4096+j] = i * j;
b[j*4096+k] = j / i;
}
}
}
}