NVIDIA / cccl

CUDA Core Compute Libraries

initialize universal vector on the host #810

Open pca006132 opened 2 years ago

pca006132 commented 2 years ago

We use thrust::universal_vector to store data that might be accessed from the CPU or the GPU. We do the processing on the CPU for smaller buffers and use the GPU when the buffer is larger. However, when we construct the universal vector, it seems that the constructor calls thrust::uninitialized_fill, which performs the fill on the GPU, migrates the memory to the GPU, and causes page faults for subsequent accesses on the CPU. Prefetching helps mitigate the performance issue a bit, but it is still suboptimal.
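Concretely, the prefetch mitigation looks roughly like the sketch below; the explicit cudaDeviceSynchronize is only there to make sure the migration has actually finished before the host touches the data (the benchmark further down just issues the asynchronous prefetch):

#include <cuda_runtime.h>
#include <thrust/universal_vector.h>

void prefetch_to_host(thrust::universal_vector<int> &v) {
  // ask the driver to migrate the managed pages back to system memory
  cudaMemPrefetchAsync(v.data().get(), v.size() * sizeof(int), cudaCpuDeviceId);
  // wait until the migration has completed so subsequent host accesses do not fault
  cudaDeviceSynchronize();
}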

I wonder if it is possible to provide an API that allows us to construct the universal vector on the CPU, without touching the GPU. Below is an example that shows how much faster we can get with such an API:

#include <chrono>
#include <iostream>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/uninitialized_fill.h>
#include <thrust/universal_vector.h>

constexpr bool prefetch = false;

int universal(int vec_length) {
  auto t0 = std::chrono::high_resolution_clock::now();
  thrust::universal_vector<int> test_vector(vec_length);
  if (prefetch)
    cudaMemPrefetchAsync(test_vector.data().get(), vec_length * sizeof(int),
                         cudaCpuDeviceId);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::high_resolution_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

int raw_managed_memory(int vec_length) {
  auto t0 = std::chrono::high_resolution_clock::now();
  int *managed_ptr;
  cudaMallocManaged(&managed_ptr, sizeof(int) * vec_length);
  thrust::uninitialized_fill_n(thrust::host, managed_ptr, vec_length, 0);
  for (int j = 0; j < vec_length; j++)
    managed_ptr[j] = j + 1;
  thrust::reduce(thrust::host, managed_ptr, managed_ptr + vec_length, 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::high_resolution_clock::now();
  // release the managed allocation; the free is not part of the timed region
  cudaFree(managed_ptr);

  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

int main() {
  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal(100);
  }

  constexpr int repeat = 1000;
  constexpr int vec_length = 10000;

  int results[2] = {0};
  for (int i = 0; i < repeat; i++) {
    results[0] += raw_managed_memory(vec_length);
    results[1] += universal(vec_length);
  }
  for (int &r : results)
    r /= repeat;

  std::cout << "interleaved:" << std::endl;
  std::cout << "raw managed memory: " << results[0] << "us"
            << std::endl;
  std::cout << "universal vector:   " << results[1] << "us"
            << std::endl;

  results[0] = 0;
  results[1] = 0;
  for (int i = 0; i < repeat; i++)
    results[0] += raw_managed_memory(vec_length);
  for (int i = 0; i < repeat; i++)
    results[1] += universal(vec_length);
  for (int &r : results)
    r /= repeat;

  std::cout << std::endl;
  std::cout << "grouped:" << std::endl;
  std::cout << "raw managed memory: " << results[0] << "us"
            << std::endl;
  std::cout << "universal vector:   " << results[1] << "us"
            << std::endl;
  std::cout << std::endl;
}

With prefetching disabled:

interleaved:
raw managed memory: 14us
universal vector:   199us

grouped:
raw managed memory: 14us
universal vector:   70us

With prefetching enabled:

interleaved:
raw managed memory: 8us
universal vector:   63us

grouped:
raw managed memory: 15us
universal vector:   65us
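In real code (rather than a benchmark reproducer) the cudaMallocManaged allocation above should also be wrapped so it is freed automatically. A minimal RAII sketch, with my own managed_buffer name (not a Thrust or CUDA type):

#include <cstddef>
#include <cuda_runtime.h>

// Owns a cudaMallocManaged allocation and frees it on destruction, so the
// host-first initialization pattern from raw_managed_memory() does not leak.
template <typename T>
struct managed_buffer {
  T *ptr = nullptr;
  std::size_t n = 0;

  explicit managed_buffer(std::size_t count) : n(count) {
    cudaMallocManaged(&ptr, sizeof(T) * count);
  }
  ~managed_buffer() { cudaFree(ptr); }

  // non-copyable: the wrapper owns the allocation
  managed_buffer(const managed_buffer &) = delete;
  managed_buffer &operator=(const managed_buffer &) = delete;

  T *begin() { return ptr; }
  T *end() { return ptr + n; }
  T &operator[](std::size_t i) { return ptr[i]; }
};

With this, the body of raw_managed_memory() reduces to constructing a managed_buffer<int>, filling it with thrust::uninitialized_fill_n(thrust::host, ...), and letting the destructor free the memory.
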
pauleonix commented 1 year ago

As a workaround, one can adapt the uninitialized_allocator from examples/uninitialized_vector.cu so that it derives from thrust::cuda::universal_allocator:

#include <chrono>
#include <iostream>

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/system/cuda/memory.h>
#include <thrust/uninitialized_fill.h>
#include <thrust/universal_vector.h>

template<typename T>
  struct uninitialized_allocator
    : thrust::cuda::universal_allocator<T>
{
  __host__
  uninitialized_allocator() {}
  __host__
  uninitialized_allocator(const uninitialized_allocator & other)
    : thrust::cuda::universal_allocator<T>(other) {}
  __host__
  ~uninitialized_allocator() {}

  uninitialized_allocator & operator=(const uninitialized_allocator &) = default;

  template<typename U>
  struct rebind
  {
    typedef uninitialized_allocator<U> other;
  };

  __host__ __device__
  void construct(T *)
  {
    // no-op
  }
};

template <typename T>
using uninitialized_vector = thrust::universal_vector<T, uninitialized_allocator<T>>;

auto universal(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  thrust::universal_vector<int> test_vector(vec_length);
  cudaMemPrefetchAsync(test_vector.data().get(), vec_length * sizeof(int),
                       cudaCpuDeviceId);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

auto universal_uninit(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  uninitialized_vector<int> test_vector(vec_length);
  thrust::uninitialized_fill(thrust::host, test_vector.begin(), test_vector.end(), 0);
  for (int j = 0; j < vec_length; j++)
    test_vector[j] = j + 1;
  thrust::reduce(thrust::host, test_vector.begin(), test_vector.end(), 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();

  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

auto raw_managed_memory(int vec_length) {
  auto t0 = std::chrono::steady_clock::now();
  int *managed_ptr;
  cudaMallocManaged(&managed_ptr, sizeof(int) * vec_length);
  thrust::uninitialized_fill_n(thrust::host, managed_ptr, vec_length, 0);
  for (int j = 0; j < vec_length; j++)
    managed_ptr[j] = j + 1;
  thrust::reduce(thrust::host, managed_ptr, managed_ptr + vec_length, 0,
                 thrust::plus<int>{});
  auto t1 = std::chrono::steady_clock::now();
  // release the managed allocation; the free is not part of the timed region
  cudaFree(managed_ptr);

  return std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
}

int main() {
  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal(100);
  }

  constexpr int repeat = 1000;
  constexpr int vec_length = 10000;

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++) {
      results[0] += raw_managed_memory(vec_length);
      results[1] += universal(vec_length);
    }
    for (auto &r : results)
      r /= repeat;

    std::cout << "interleaved:\n"
                 "raw managed memory:             " << results[0] << "us\n"
                 "universal vector w/ prefetch:   " << results[1] << "us\n\n";
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++)
      results[0] += raw_managed_memory(vec_length);
    for (int i = 0; i < repeat; i++)
      results[1] += universal(vec_length);
    for (auto &r : results)
      r /= repeat;

    std::cout << "grouped:\n"
                 "raw managed memory:             " << results[0] << "us\n"
                 "universal vector w/ prefetch:   " << results[1] << "us\n\n";
  }

  // warm up
  for (int i = 0; i < 10; i++) {
    raw_managed_memory(100);
    universal_uninit(100);
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++) {
      results[0] += raw_managed_memory(vec_length);
      results[1] += universal_uninit(vec_length);
    }
    for (auto &r : results)
      r /= repeat;

    std::cout << "interleaved:\n"
                 "raw managed memory:                     " << results[0] << "us\n"
                 "universal uninit vector w/o prefetch:   " << results[1] << "us\n\n";
  }

  {
    std::chrono::microseconds::rep results[2] = {0};
    for (int i = 0; i < repeat; i++)
      results[0] += raw_managed_memory(vec_length);
    for (int i = 0; i < repeat; i++)
      results[1] += universal_uninit(vec_length);
    for (auto &r : results)
      r /= repeat;

    std::cout << "grouped:\n"
                 "raw managed memory:                     " << results[0] << "us\n"
                 "universal uninit vector w/o prefetch:   " << results[1] << "us\n\n";
  }
}

For me this gives

interleaved:
raw managed memory:             10us
universal vector w/ prefetch:   155us

grouped:
raw managed memory:             30us
universal vector w/ prefetch:   202us

interleaved:
raw managed memory:                     10us
universal uninit vector w/o prefetch:   37us

grouped:
raw managed memory:                     30us
universal uninit vector w/o prefetch:   14us
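
Stripped of the benchmarking, the core of the workaround is just the following (a sketch assuming the uninitialized_allocator / uninitialized_vector definitions above are in scope):

// Construction with the custom allocator skips the device-side fill, so the
// first touch below happens on the host and the pages stay CPU-resident.
void host_first_init(int vec_length) {
  uninitialized_vector<int> v(vec_length);  // no device-side thrust::uninitialized_fill
  thrust::uninitialized_fill(thrust::host, v.begin(), v.end(), 0);
  for (int j = 0; j < vec_length; j++)
    v[j] = j + 1;                           // plain host access, no migration page faults
}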