Dear all,
I recently attended a talk by Johannes Doerfert where he mentioned that some bugs regarding OpenMP are not reported and you are interested in seeing them.
Some time ago I found an issue where the OpenMP runtime allocates a massive amount of memory that I think was not reported yet.
It is triggered by a simple task LU-based algorithm (reproducer found below). This occurs with the latest 16.0.6 release, but is also reproducible with previous versions.
I don’t remember the exact details of my former analysis, but it should be roughly like this:
- The dependency structure of task LU causes the dephash entries to keep references to all depnodes created during the LU algorithm.
- Thereby the refcount of the depnodes never becomes 0.
- The issue seems to be caused by the “row/column” blocks/tasks.
- When a block becomes a row/col block, the corresponding dephash entry
gets all depnodes of the trail tasks linked (last_set member) that
are in the same row/column as the block itself.
However, this alone would only account for a fraction of the memory allocated. The other issue is the internal allocator that blows up each allocation. For a 16-byte depnode_list entry, something around 400 bytes were allocated, if I remember correctly.
When I run the reproducer with 512 blocks and a recent version of clang I see around 60GB used:
./simple-clang 512
# n_blocks: 512
# n_tasks: 44870400
threads: 104 n_blocks: 512 n_tasks: 44870400 hwm diff: 5.890979e+04 MB (beg: 8.56e+01 end: 5.90e+04) rss diff: 5.890979e+04 MB (beg: 8.56e+01 end: 5.90e+04) dur: 1.787654e+02 s
// g++ -fopenmp -o simple-gcc simple.cpp
// clang++ -fopenmp -o simple-clang simple.cpp
//
// run:
// ./tasking [<no. of blocks>]
//
// Mimics task LU algorithm and by default uses 50 blocks that create around
// 42925 tasks.
// With blocks = 300 (9,045,050 tasks) a significant amount of mem. usage
// is visible.
// The no. of tasks created for NB blocks is NB * (NB + 1) * (2 * NB + 1) / 6.
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <time.h>
#include <omp.h>
// Scan /proc/self/status for `key_name` (e.g. "VmHWM:") and return the
// numeric value that follows it, in the units used by the file (kB for
// the Vm* entries). Returns -1.0 if the key is missing, the value cannot
// be parsed, or the file cannot be opened.
//
// Fixes vs. original: the extraction loop tests the stream state directly
// instead of the `while (file.good())` anti-pattern, and a failed value
// extraction now reports -1.0 instead of returning an unparsed value
// (which was uninitialized pre-C++11 and a silent 0 in C++11).
static double get_status_value(const std::string & key_name)
{
    std::ifstream file("/proc/self/status");
    std::string token;
    while (file >> token) {
        if (token == key_name) {
            uint64_t value = 0;
            if (file >> value)
                return (double)value;
            return -1.0; // key present but value not parseable
        }
    }
    return -1.0; // key not found (or file unreadable)
}
// Peak resident set size ("high water mark") of this process in kB; -1.0 on failure.
static double get_memory_hwm_kb() { return get_status_value("VmHWM:"); }
// Current resident set size of this process in kB; -1.0 on failure.
static double get_memory_rss_kb() { return get_status_value("VmRSS:"); }
// Deliberately empty task body: every OpenMP task runs this no-op, so any
// memory growth observed by the reproducer comes from the runtime's
// dependency tracking rather than from user work. The macro keeps the many
// task-creation sites uniform.
#define TASK() task(0)
static void task(long) {}
// Driver: builds the task-LU dependency pattern with empty tasks and
// reports the memory (HWM/RSS) consumed by the OpenMP runtime.
// Usage: ./tasking [<no. of blocks>]   (default 50)
int main(int argc, char * argv[])
{
    int n_blocks = 50;
    if (argc > 1) {
        // Validate the user input: the original code fed an unchecked
        // strtol() result into a VLA size, so 0/negative input was UB.
        n_blocks = (int)strtol(argv[1], NULL, 0);
        if (n_blocks < 1) {
            fprintf(stderr, "invalid no. of blocks: %s\n", argv[1]);
            return 1;
        }
    }

    // Heap storage instead of a VLA: VLAs are a non-standard extension in
    // C++ and large block counts would overflow the stack. The raw char*
    // view keeps the depend() clauses below identical to the original.
    std::vector<char> dep_storage((size_t)n_blocks * (size_t)n_blocks);
    char * deps = dep_storage.data();

    // NB * (NB + 1) * (2*NB + 1) / 6 tasks. Use 64-bit math: the
    // intermediate product overflows int already for NB around 1000.
    long long n_tasks =
        (long long)n_blocks * (n_blocks + 1) * (2LL * n_blocks + 1) / 6;
    printf("# n_blocks: %d\n", n_blocks);
    printf("# n_tasks: %lld\n", n_tasks);

    // Create threads up front so their stacks are included in the baseline
    // HWM/RSS readings.
#pragma omp parallel
    omp_get_thread_num();

    double duration_s{};
    double hwm_begin = get_memory_hwm_kb();
    double rss_begin = get_memory_rss_kb();

#pragma omp parallel default(none) shared(deps, n_blocks, duration_s)
    {
#pragma omp single
        {
            double t_start = omp_get_wtime();
            for (int i = 0; i < n_blocks; ++i) {
                // diag block
#pragma omp task depend(inout: deps[i * n_blocks + i])
                TASK();
                for (int j = i + 1; j < n_blocks; ++j) {
                    // row block
#pragma omp task depend(in: deps[i * n_blocks + i]) \
                 depend(inout: deps[i * n_blocks + j])
                    TASK();
                    // col block
#pragma omp task depend(in: deps[i * n_blocks + i]) \
                 depend(inout: deps[j * n_blocks + i])
                    TASK();
                }
                for (int j = i + 1; j < n_blocks; ++j) {
                    for (int k = i + 1; k < n_blocks; ++k) {
                        // trail block
#pragma omp task depend(in: deps[i * n_blocks + k], deps[j * n_blocks + i]) \
                 depend(inout: deps[j * n_blocks + k])
                        TASK();
                    }
                }
            }
#pragma omp taskwait
            duration_s = omp_get_wtime() - t_start;
        }
    }

    double hwm_end = get_memory_hwm_kb();
    double rss_end = get_memory_rss_kb();
    printf("threads: %2d n_blocks: %d n_tasks: %lld "
           "hwm diff: %e MB (beg: %7.2e end: %7.2e) "
           "rss diff: %e MB (beg: %7.2e end: %7.2e) "
           "dur: %e s\n",
           omp_get_max_threads(), n_blocks, n_tasks,
           (hwm_end - hwm_begin) / 1e3, hwm_begin / 1e3, hwm_end / 1e3,
           (rss_end - rss_begin) / 1e3, rss_begin / 1e3, rss_end / 1e3,
           duration_s);
    return 0;
}