Handling vectors with omp_target_alloc_shared()

Hi,
My test code below works on host, but not on device, could someone
give me advice if there's a workaround for this?

#include <cstdlib>
#include <new>
#include <limits>
#include <iostream>
#include <vector>
#include <omp.h>
#include <array>
#include <memory_resource>

extern "C" {
  void* llvm_omp_target_alloc_shared(size_t, int);
}

template <class T>
struct Mallocator
{
  typedef T value_type;

  Mallocator () = default;
  template <class U> constexpr Mallocator (const Mallocator <U>&) noexcept {}

  [[nodiscard]] T* allocate(std::size_t n) {
    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
      throw std::bad_array_new_length();

    //if (auto p = static_cast<T*>(std::malloc(n*sizeof(T)))) {
    //if (T* p =
static_cast<T*>(llvm_omp_target_alloc_shared(n*sizeof(T), 0))) {
    if (std::uintptr_t p =
(std::uintptr_t)llvm_omp_target_alloc_shared(n*sizeof(T), 0)) {
      //report(p, n);
      return reinterpret_cast<T*>(p);
    }

    throw std::bad_alloc();
  }

  void deallocate(T* p, std::size_t n) noexcept {
    //report(p, n, 0);
    //std::free(p);
    //omp_target_free(p, 0);
  }

private:
  void report(T* p, std::size_t n, bool alloc = true) const {
    std::cout << (alloc ? "Alloc: " : "Dealloc: ") << sizeof(T)*n
      << " bytes at " << std::hex << std::showbase
      << reinterpret_cast<void*>(p) << std::dec << '\n';
  }
};

template <class T, class U>
bool operator==(const Mallocator <T>&, const Mallocator <U>&) { return true; }
template <class T, class U>
bool operator!=(const Mallocator <T>&, const Mallocator <U>&) { return false; }

//typedef std::vector<int, Mallocator<int> > vector<int>;
class A {
public:
     A();
     double get() {
       return this->v_[0][0];
     };
     int f() { return reinterpret_cast<int*>(array_)[0]; };

private:
   std::vector<std::array<double,3>, Mallocator< std::array<double, 3> > > v_;
   //std::vector<double, Mallocator< double> > v_;
   std::uintptr_t array_;
};

  v_.reserve(1024);
  for (int i=0;i<1024;i++)
     v_[i].fill(123.4);

}

int main()
{
  A *p = (A*)llvm_omp_target_alloc_shared(sizeof(A), 0);
  new (p) A();

  std::vector<std::array<double, 3>, Mallocator<std::array<double,
3>>> myv(1024);
  myv[0].fill(0.256);
#pragma omp target parallel for
  for (int i=0;i < 1024;i++) {
   printf("%f\n", p->get());
   //printf("%f\n", myv[0][0]);
  }

}

You need to specify is_device_ptr​ in your region so the runtime knows to just copy the pointer into the device. Otherwise, it will assume it is on the host and try to look it up in the device mapping table and get nothing. You can use env LIBOMPTARGET_INFO=-1​ to see the difference in the assumed mapping type.

Joseph

I’d like to avoid using the is_device_pre clause as my app deals with a neural network consisting of tens of thousands of neurons.

Itaru.

The pointer value should be copied to the kernel by value, I wouldn’t think it would have a large amount of overhead. I don’t think there’s any other way to tell the runtime to use a device pointer instead of trying to create one using the mapping table. You could try using a single large pointer to a memory region and extracting it from there.

This worked for me:

#include <cstdlib>
#include <new>
#include <limits>
#include <iostream>
#include <vector>
#include <omp.h>
#include <array>
#include <memory_resource>

extern "C" {
  void* llvm_omp_target_alloc_shared(size_t, int);
}

template <class T>
struct Mallocator
{
  typedef T value_type;

  Mallocator () = default;
  template <class U> constexpr Mallocator (const Mallocator <U>&) noexcept {}

  [[nodiscard]] T* allocate(std::size_t n) {
    if (n > std::numeric_limits<std::size_t>::max() / sizeof(T))
      throw std::bad_array_new_length();

    //if (auto p = static_cast<T*>(std::malloc(n*sizeof(T)))) {
    //if (T* p =
static_cast<T*>(llvm_omp_target_alloc_shared(n*sizeof(T), 0))) {
    if (std::uintptr_t p =
(std::uintptr_t)llvm_omp_target_alloc_shared(n*sizeof(T), 0)) {
      //report(p, n);
      return reinterpret_cast<T*>(p);
    }

    throw std::bad_alloc();
  }

  void deallocate(T* p, std::size_t n) noexcept {
    //report(p, n, 0);
    //std::free(p);
    //omp_target_free(p, 0);
  }

private:
  void report(T* p, std::size_t n, bool alloc = true) const {
    std::cout << (alloc ? "Alloc: " : "Dealloc: ") << sizeof(T)*n
      << " bytes at " << std::hex << std::showbase
      << reinterpret_cast<void*>(p) << std::dec << '\n';
  }
};

template <class T, class U>
bool operator==(const Mallocator <T>&, const Mallocator <U>&) { return true; }
template <class T, class U>
bool operator!=(const Mallocator <T>&, const Mallocator <U>&) { return false; }

//typedef std::vector<int, Mallocator<int> > vector<int>;
class A {
public:
     A();
     double get() {
       return v_[0][0];
     };

private:
   //std::vector<std::array<double,3>, Mallocator< std::array<double,
3> > > v_ = std::vector<std::array<double, 3>,
Mallocator<std::array<double, 3> > >(1024);
   std::vector<std::array<double,3>, Mallocator< std::array<double, 3> > > v_;
};

  v_.reserve(1024);
  for (int i=0;i<1024;i++)
     v_[i].fill(123.4);

}

int main()
{
  std:uintptr_t p = (std::uintptr_t)llvm_omp_target_alloc_shared(sizeof(A), 0);
  new (reinterpret_cast<A*>(p)) A();
#pragma omp target parallel for
  for (int i=0;i < 1024;i++) {
   printf("%f\n", reinterpret_cast<A*>(p)->get());
  }

}

This certainly bypasses is_device_ptr​, but it’s the same operation fundamentally. If you use LIBOMPTARGET_INFO​=1 you can see that in both cases a 64-bit variable is being copied by-value to the device.