Target CUDA RTL --> The primary context is inactive, set its flags to CU_CTX_SCHED_BLOCKING_SYNC

I often see this message when executing my work-in-progress offloading app on x86
with an older NVIDIA GPU (sm_35). Can someone enlighten me on this so I
can solve it quickly?

Thanks,

And when this happens, the app does not respond to signals immediately, either.
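For what it's worth, the message appears to come from libomptarget's CUDA plugin during device initialization. Paraphrasing roughly what it does (a sketch, not the exact LLVM source; the driver-API calls and the flag are real):

// Rough sketch of the CUDA plugin's device initialization.
#include <cuda.h>
#include <cstdio>

static bool initPrimaryContext(CUdevice Dev, CUcontext &Ctx) {
  unsigned Flags = 0;
  int IsActive = 0;
  // Query the current state of the device's primary context.
  if (cuDevicePrimaryCtxGetState(Dev, &Flags, &IsActive) != CUDA_SUCCESS)
    return false;
  if (!IsActive) {
    // This is where the message in question gets printed; it is
    // informational, not an error.
    std::fprintf(stderr, "Target CUDA RTL --> The primary context is "
                         "inactive, set its flags to "
                         "CU_CTX_SCHED_BLOCKING_SYNC\n");
    if (cuDevicePrimaryCtxSetFlags(Dev, CU_CTX_SCHED_BLOCKING_SYNC) !=
        CUDA_SUCCESS)
      return false;
  }
  // The backtrace below shows the hang happening inside this call.
  return cuDevicePrimaryCtxRetain(&Ctx, Dev) == CUDA_SUCCESS;
}

So the message itself looks harmless; the hang is in the subsequent cuDevicePrimaryCtxRetain call: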

(gdb) where
#0 0x00002aaaaaacd6c2 in clock_gettime ()
#1 0x00002aaaabd347fd in clock_gettime () from /usr/lib64/libc.so.6
#2 0x00002aaaac98737e in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#3 0x00002aaaaca4b4f7 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#4 0x00002aaaac88140a in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#5 0x00002aaaac92afbe in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#6 0x00002aaaac92d0d7 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#7 0x00002aaaac857719 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#8 0x00002aaaac9c915e in cuDevicePrimaryCtxRetain ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#9 0x00002aaaac523757 in __tgt_rtl_init_device ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
#10 0x00002aaaaaca28bb in DeviceTy::init() ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#11 0x00002aaaac297348 in std::__1::__call_once(unsigned long
volatile&, void*, void (*)(void*)) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
#12 0x00002aaaaaca2d88 in device_is_ready(int) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#13 0x00002aaaaacaf296 in CheckDeviceAndCtors(long) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#14 0x00002aaaaaca5ead in __tgt_target_data_begin_mapper ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#15 0x00002aaaab3a4958 in nest::SimulationManager::initialize() (this=0x5d3480)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/simulation_manager.cpp:76
#16 0x00002aaaab39cbb9 in nest::KernelManager::initialize() (this=0x5d3380)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/kernel_manager.cpp:88
#17 0x0000000000405769 in neststartup(int*, char***, SLIInterpreter&) (
    argc=argc@entry=0x7ffffffee554, argv=argv@entry=0x7ffffffee558, engine=...)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/neststartup.cpp:87
#18 0x0000000000405650 in main (argc=<optimized out>, argv=<optimized out>)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/main.cpp:42

I obtained the desired result (a crash) without a Spack environment.

No, I take that back. Here's the backtrace:

(gdb) where
#0 0x00002aaaaaacd6c2 in clock_gettime ()
#1 0x00002aaaabd167fd in clock_gettime () from /usr/lib64/libc.so.6
#2 0x00002aaaac97837e in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#3 0x00002aaaaca3c4f7 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#4 0x00002aaaac87240a in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#5 0x00002aaaac91bfbe in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#6 0x00002aaaac91e0d7 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#7 0x00002aaaac848719 in ?? ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#8 0x00002aaaac9ba15e in cuDevicePrimaryCtxRetain ()
   from /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
#9 0x00002aaaac514757 in __tgt_rtl_init_device ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
#10 0x00002aaaab9b88bb in DeviceTy::init() ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#11 0x00002aaaac279348 in std::__1::__call_once(unsigned long
volatile&, void*, void (*)(void*)) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
#12 0x00002aaaab9b8d88 in device_is_ready(int) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#13 0x00002aaaab9c5296 in CheckDeviceAndCtors(long) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#14 0x00002aaaab9bbead in __tgt_target_data_begin_mapper ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#15 0x00002aaaaabfaa58 in nest::SimulationManager::initialize() (this=0x5d3290)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/simulation_manager.cpp:76
#16 0x00002aaaaabf2c69 in nest::KernelManager::initialize() (this=0x5d3190)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/kernel_manager.cpp:88
#17 0x0000000000405769 in neststartup(int*, char***, SLIInterpreter&) (
    argc=argc@entry=0x7fffffff0a84, argv=argv@entry=0x7fffffff0a88, engine=...)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/neststartup.cpp:87
#18 0x0000000000405650 in main (argc=<optimized out>, argv=<optimized out>)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/main.cpp:42

This happens in an unpredictable way, even though I launch the app the same way every time.

Could you provide the output of the following?

which nvcc
nvcc --version
ldd /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
nvidia-smi
Ye

$ which nvcc
/usr/local/software/jureca/Stages/2019a/software/CUDA/10.1.105/bin/nvcc
[kitayama1@jrc0004 kitayama1]$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Fri_Feb__8_19:08:17_PST_2019
Cuda compilation tools, release 10.1, V10.1.105
[kitayama1@jrc0004 kitayama1]$ ldd
/p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
linux-vdso.so.1 => (0x00007ffc2a767000)
libcuda.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
(0x00002ac5418b9000)
libelf.so.1 => /usr/lib64/libelf.so.1 (0x00002ac542aa1000)
libc++.so.1 => /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
(0x00002ac5416d7000)
libc++abi.so.1 =>
/p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++abi.so.1
(0x00002ac5417a0000)
libm.so.6 => /usr/lib64/libm.so.6 (0x00002ac542cb9000)
libgcc_s.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libgcc_s.so.1
(0x00002ac5417e2000)
libc.so.6 => /usr/lib64/libc.so.6 (0x00002ac542fbb000)
libdl.so.2 => /usr/lib64/libdl.so.2 (0x00002ac543389000)
libpthread.so.0 => /usr/lib64/libpthread.so.0 (0x00002ac54358d000)
librt.so.1 => /usr/lib64/librt.so.1 (0x00002ac5437a9000)
libz.so.1 => /usr/local/software/jureca/Stages/2019a/software/zlib/1.2.11-GCCcore-8.3.0/lib/libz.so.1
(0x00002ac5417fd000)
/lib64/ld-linux-x86-64.so.2 (0x00002ac541695000)
libatomic.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libatomic.so.1
(0x00002ac541816000)
[kitayama1@jrc0004 kitayama1]$ nvidia-smi
Tue Sep 29 01:21:23 2020

Still not clear what went wrong. I just installed the Clang 11 release on my local cluster with sm_35, and no issues show up.
Is this issue exposed only by a complicated app, or does even a simple “omp target” region hang the code? Are you able to run any CUDA program?
The call stack indicates cuDevicePrimaryCtxRetain tries to interact with the driver, but the driver doesn’t respond and keeps the host side waiting.
It is also not clear whether /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1 is consistent with the running Linux kernel driver.
Could you ask Jülich whether they have any clue about the settings on that machine?
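If you want to take OpenMP out of the picture entirely, a minimal driver-API test would exercise only the call your backtraces hang in. This is just a sketch; the build line and include path are assumptions for your setup:

// ctx_test.cpp: does cuDevicePrimaryCtxRetain alone hang?
// Build, e.g.: clang++ ctx_test.cpp -I$CUDA_HOME/include -lcuda -o ctx_test
#include <cuda.h>
#include <cstdio>

int main() {
  CUresult RC = cuInit(0);
  if (RC != CUDA_SUCCESS) {
    std::fprintf(stderr, "cuInit failed: %d\n", (int)RC);
    return 1;
  }
  CUdevice Dev;
  cuDeviceGet(&Dev, 0);
  std::printf("calling cuDevicePrimaryCtxRetain...\n");
  std::fflush(stdout);
  CUcontext Ctx;
  RC = cuDevicePrimaryCtxRetain(&Ctx, Dev); // the call that hangs for you
  std::printf("cuDevicePrimaryCtxRetain returned %d\n", (int)RC);
  cuDevicePrimaryCtxRelease(Dev);
  return RC == CUDA_SUCCESS ? 0 : 1;
}

To check user-space/kernel consistency, you could also compare the driver version nvidia-smi reports with /proc/driver/nvidia/version on the compute node.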

Ye

Ye,
When it comes to OpenMP offloading, I think I'm on my own.

Still, I don't fully understand under what conditions the app hangs, but one
workaround I found: on the compute node with a GPU, I loaded only the minimal
set of system-provided modules, and then the app ran through to the end. The
hangs happen at a very early stage.

Ye,
Do you use Environment Modules as your package manager? What env vars
do you set when building Clang and running the app?

My cluster doesn’t have a module system. The libcuda comes from

/lib64/libcuda.so.1

On my desktop, it comes from

/usr/lib/x86_64-linux-gnu/libcuda.so.1

Usually libcuda.so is installed as part of the NVIDIA driver, and the admins don’t mess with it.

That is why your libcuda.so looks suspicious to me:

/usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1
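On an RPM-based system (an assumption about your OS) you can ask which package, if any, owns each copy; a driver-installed libcuda typically belongs to an NVIDIA driver package, while a site-installed copy usually belongs to none:

rpm -qf /usr/lib64/libcuda.so.1
rpm -qf /usr/local/software/jureca/Stages/2019a/software/nvidia/driver/lib64/libcuda.so.1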

Were you saying that with minimal modules your app runs to the end without any issue, but with more modules added it hangs?
If that is the case, then the hang is very likely caused by one of the dynamic libraries.
Please compare

ldd your_app

ldd /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so

in both module settings.
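If the ldd output alone doesn’t explain the difference, the glibc dynamic linker can also trace what actually gets resolved at run time (LD_DEBUG is a standard glibc feature):

LD_DEBUG=libs ./your_app 2> ld_debug.log

ldd only shows what would be resolved in the current environment, which can differ from what a process picked up when it was launched.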

Ye

I ran my app with only the minimal system-provided modules loaded, but the app still hung up.
So here are the ldd results:

[kitayama1@jrc0002 kitayama1]$ ldd opt/nest-clang-offload/bin/nest
linux-vdso.so.1 => (0x00007ffc41459000)
libsli_readline.so =>
/p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/libsli_readline.so
(0x00002b2aa478d000)
libmodels.so =>
/p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/libmodels.so
(0x00002b2aa498d000)
libnestkernel.so =>
/p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/libnestkernel.so
(0x00002b2aa4f13000)
librandom.so =>
/p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/librandom.so
(0x00002b2aa479b000)
libsli.so => /p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/libsli.so
(0x00002b2aa47f5000)
libnestutil.so =>
/p/project/cjzam11/kitayama1/opt/nest-clang-offload/bin/../lib64/nest/libnestutil.so
(0x00002b2aa4928000)
libstdc++.so.6 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libstdc++.so.6
(0x00002b2aa5108000)
libm.so.6 => /usr/lib64/libm.so.6 (0x00002b2aa52a2000)
libomp.so => /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomp.so
(0x00002b2aa55a4000)
libomptarget.so =>
/p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
(0x00002b2aa4948000)
libgcc_s.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libgcc_s.so.1
(0x00002b2aa4969000)
libpthread.so.0 => /usr/lib64/libpthread.so.0 (0x00002b2aa5673000)
libc.so.6 => /usr/lib64/libc.so.6 (0x00002b2aa588f000)
/lib64/ld-linux-x86-64.so.2 (0x00002b2aa4769000)
libdl.so.2 => /usr/lib64/libdl.so.2 (0x00002b2aa5c5d000)
libc++.so.1 => /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
(0x00002b2aa5e61000)
libc++abi.so.1 =>
/p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++abi.so.1
(0x00002b2aa5f2a000)
librt.so.1 => /usr/lib64/librt.so.1 (0x00002b2aa5f6b000)
libatomic.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libatomic.so.1
(0x00002b2aa6173000)
[kitayama1@jrc0002 kitayama1]$ ldd
//p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
linux-vdso.so.1 => (0x00007ffff4d5d000)
libcuda.so.1 => /usr/lib64/libcuda.so.1 (0x00002af63ec29000)
libelf.so.1 => /usr/lib64/libelf.so.1 (0x00002af63fe11000)
libc++.so.1 => /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
(0x00002af63ea45000)
libc++abi.so.1 =>
/p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++abi.so.1
(0x00002af63eb0e000)
libm.so.6 => /usr/lib64/libm.so.6 (0x00002af640029000)
libgcc_s.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libgcc_s.so.1
(0x00002af63eb50000)
libc.so.6 => /usr/lib64/libc.so.6 (0x00002af64032b000)
libdl.so.2 => /usr/lib64/libdl.so.2 (0x00002af6406f9000)
libpthread.so.0 => /usr/lib64/libpthread.so.0 (0x00002af6408fd000)
librt.so.1 => /usr/lib64/librt.so.1 (0x00002af640b19000)
libz.so.1 => /usr/lib64/libz.so.1 (0x00002af640d21000)
/lib64/ld-linux-x86-64.so.2 (0x00002af63ea05000)
libatomic.so.1 =>
/usr/local/software/jureca/Stages/2019a/software/GCCcore/8.3.0/lib64/libatomic.so.1
(0x00002af63eb6b000)

  1. I’m lost. You said “one workaround I found: on the compute node with a GPU, I loaded only the minimal set of system-provided modules, and then the app ran through to the end”. This time you say it hangs. Does it hang or not, or does it hang only occasionally? When it hangs, please provide the backtrace in this minimal module environment.

  2. I saw “libcuda.so.1 => /usr/lib64/libcuda.so.1 (0x00002af63ec29000)” in this minimal module environment. When your app hangs as in 1, could you check whether the libcuda actually picked up at run time is the same one ldd reported? (See the note after minimal.cpp below.)

  3. As I mentioned earlier, could you test a simple omp offload program?
    minimal.cpp

int main()
{
#pragma omp target
  {
  }
  return 0;
}
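A sketch of how minimal.cpp could be built and run; the sm_35 flag comes from your earlier messages, the rest may need adjusting for your Clang build, and LIBOMPTARGET_DEBUG only prints if libomptarget was built with debugging enabled:

clang++ -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
    -Xopenmp-target -march=sm_35 minimal.cpp -o minimal
LIBOMPTARGET_DEBUG=1 ./minimal

For item 2, when the app hangs you can attach gdb to the process and run “info sharedlibrary”, or grep /proc/<pid>/maps for libcuda, to see which libcuda.so.1 was actually mapped.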

This is a backtrace taken when the app was executed with minimal modules
loaded and hung:

#0 0x00002aaaaaacd6c2 in clock_gettime ()
#1 0x00002aaaabce57fd in clock_gettime () from /usr/lib64/libc.so.6
#2 0x00002aaaacd4c37e in ?? () from /usr/lib64/libcuda.so.1
#3 0x00002aaaace104f7 in ?? () from /usr/lib64/libcuda.so.1
#4 0x00002aaaacc4640a in ?? () from /usr/lib64/libcuda.so.1
#5 0x00002aaaacceffbe in ?? () from /usr/lib64/libcuda.so.1
#6 0x00002aaaaccf20d7 in ?? () from /usr/lib64/libcuda.so.1
#7 0x00002aaaacc1c719 in ?? () from /usr/lib64/libcuda.so.1
#8 0x00002aaaacd8e15e in cuDevicePrimaryCtxRetain ()
   from /usr/lib64/libcuda.so.1
#9 0x00002aaaac4d5377 in __tgt_rtl_init_device ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.rtl.cuda.so
#10 0x00002aaaaac996fb in DeviceTy::init() ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#11 0x00002aaaac246238 in std::__1::__call_once(unsigned long
volatile&, void*, void (*)(void*)) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libc++.so.1
#12 0x00002aaaaac99bd3 in device_is_ready(int) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#13 0x00002aaaaaca5c36 in CheckDeviceAndCtors(long) ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#14 0x00002aaaaac9c86f in __tgt_target_data_begin_mapper ()
   from /p/project/cjzam11/kitayama1/opt/clang/current/lib/libomptarget.so
#15 0x00002aaaab373cd8 in nest::SimulationManager::initialize() (this=0x5d2cd0)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/simulation_manager.cpp:76
#16 0x00002aaaab36c2d9 in nest::KernelManager::initialize() (this=0x5d2bd0)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nestkernel/kernel_manager.cpp:88
#17 0x0000000000405739 in neststartup(int*, char***, SLIInterpreter&) (
    argc=argc@entry=0x7fffffffbb64, argv=argv@entry=0x7fffffffbb68, engine=...)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/neststartup.cpp:87
#18 0x0000000000405640 in main (argc=<optimized out>, argv=<optimized out>)
    at /p/project/cjzam11/kitayama1/projects/nest-simulator/nest/main.cpp:42