LLNL / conduit

Simplified Data Exchange for HPC Simulations
https://software.llnl.gov/conduit/
Other
209 stars 64 forks source link

use cases + strawman interface to raja based host device exec interface #1151

Open cyrush opened 1 year ago

cyrush commented 1 year ago

//------------------------------------------------------
// forall cases
//------------------------------------------------------

//------------------------------------------------------
// run on device
//------------------------------------------------------
{
  // Wrap the source and destination arrays held by the node.
  DataAccessorHostDevice<float64> acc_src(node["src"]);
  DataAccessorHostDevice<float64> acc_des(node["des"]);

  // This case always targets the device policy.
  ExecutionPolicy policy = ExecutionPolicy::device();

  // Register both accessors with the chosen policy before the loop.
  acc_src.use_with(policy);
  acc_des.use_with(policy);

  index_t num_elems = acc_src.number_of_elements();

  // des[idx] = 2 * src[idx] over [0, num_elems)
  forall(policy, 0, num_elems, [=] EXEC_LAMBDA(index_t idx)
  {
     const float64 doubled = 2.0 * acc_src[idx];
     acc_des.set(idx, doubled);
  });
  DEVICE_ERROR_CHECK();

  // Push the computed values back to node["des"].
  // (no op if node["des"] was originally device memory)
  acc_des.sync(node["des"]);
}

//------------------------------------------------------
// run on device, 
// result stays on device and is owned by node["des"],
// even if it was not on the device beforehand
//------------------------------------------------------
{
  // Wrap the source and destination arrays held by the node.
  DataAccessorHostDevice<float64> acc_src(node["src"]);
  DataAccessorHostDevice<float64> acc_des(node["des"]);

  // This case always targets the device policy.
  ExecutionPolicy policy = ExecutionPolicy::device();

  // Register both accessors with the chosen policy before the loop.
  acc_src.use_with(policy);
  acc_des.use_with(policy);

  index_t num_elems = acc_src.number_of_elements();

  // des[idx] = 2 * src[idx] over [0, num_elems)
  forall(policy, 0, num_elems, [=] EXEC_LAMBDA(index_t idx)
  {
     const float64 doubled = 2.0 * acc_src[idx];
     acc_des.set(idx, doubled);
  });
  DEVICE_ERROR_CHECK();

  // Hand the results over so that node["des"] owns them.
  // (no op if node["des"] was originally device memory)
  acc_des.move(node["des"]);
}

//------------------------------------------------------
// run where the src data is
//------------------------------------------------------
{
  DataAccessorHostDevice<float64> acc_src(node["src"]);
  DataAccessorHostDevice<float64> acc_des(node["des"]);

  // Pick the execution policy that matches where the source data
  // currently lives, instead of forcing host or device.
  ExecutionPolicy policy = acc_src.active_space().execution_policy();

  // Register BOTH accessors with the policy, as the other forall
  // cases do (the original registered acc_des twice and never
  // registered acc_src).
  acc_src.use_with(policy);
  acc_des.use_with(policy);

  index_t size = acc_src.number_of_elements();

  // des[idx] = 2 * src[idx] over [0, size)
  forall(policy, 0, size, [=] EXEC_LAMBDA(index_t idx)
  {
    const float64 val = 2.0 * acc_src[idx];
    acc_des.set(idx,val);
  });
  DEVICE_ERROR_CHECK();

  // sync values to node["des"],
  // (no op if node["des"] was originally in
  //  same memory space as node["src"] )
  acc_des.sync(node["des"]);
}

//------------------------------------------------------
// more complex cases
//------------------------------------------------------

//------------------------------------------------------
// complex run on device 
// double lambda forwarding concrete template tag
// for use in lambda
//
// ( requires c++ 20 b/c of templated lambda)
//------------------------------------------------------
{
  DataAccessorHostDevice<float64> acc_src(node["src"]);
  DataAccessorHostDevice<float64> acc_des(node["des"]);

  ExecutionPolicy policy = ExecutionPolicy::device();

  // Register BOTH accessors with the policy (the original registered
  // acc_des twice and never registered acc_src).
  acc_src.use_with(policy);
  acc_des.use_with(policy);

  index_t size = acc_src.number_of_elements();

  // Outputs of the reduction, filled in by the dispatched lambda.
  index_t min_loc = -1;
  float64 min_val = 0;

  // The outer lambda is instantiated with a concrete Exec tag type that
  // supplies the for/reduce policy types used by the inner loop.
  // It captures by reference so it can write min_val / min_loc.
  dispatch(policy, [&] <typename Exec>(Exec &exec)
  {
    float64 identity = std::numeric_limits<float64>::max();
    using for_policy    = typename Exec::for_policy;
    using reduce_policy = typename Exec::reduce_policy;

    ReduceMinLoc<reduce_policy,float64> reducer(identity,-1);

    // The inner lambda captures by value; the loop index is `i`
    // (the original body referred to an undeclared `idx`).
    forall<for_policy>(0, size, [=] EXEC_LAMBDA (int i)
    {
       const float64 val = 2.0 * acc_src[i];
       reducer.minloc(val,i);
       acc_des.set(i,val);
    });
    DEVICE_ERROR_CHECK();

    min_val = reducer.get();
    min_loc = reducer.getLoc();
  });

  // sync values to node["des"]
  // (no op if node["des"] was originally device memory)
  acc_des.sync(node["des"]);
}

//------------------------------------------------------
// complex run on device using functor
// (functor implementation)
//------------------------------------------------------
struct ExecFunctor
{
  float64 min_val;
  index_t min_loc;

  DataAccessorHostDevice<float64> acc_src;
  DataAccessorHostDevice<float64> acc_des;

  template<typename Exec>
  void operator()(Exec &exec)
  {
    float64 identity = std::numeric_limits<float64>::max();
    using for_policy    = typename Exec::for_policy;
    using reduce_policy = typename Exec::reduce_policy;

    ReduceMinLoc<reduce_policy,float64> reducer(identity, -1);

    forall<for_policy>(0, size, [=] (int i)
    {
       const float64 val = 2.0 * acc_src[idx];
       reducer.minloc(val,i);
       acc_des.set(idx,val);
    });
    DEVICE_ERROR_CHECK();

    min_val = reducer.get();
    min_loc = reducer.getLoc();
  }
};

//------------------------------------------------------
// complex run on device using functor 
// (functor dispatch)
//------------------------------------------------------
{
  DataAccessorHostDevice<float64> acc_src(node["src"]);
  DataAccessorHostDevice<float64> acc_des(node["des"]);

  ExecutionPolicy policy = ExecutionPolicy::device();

  // Register BOTH accessors with the policy (the original registered
  // acc_des twice and never registered acc_src).
  acc_src.use_with(policy);
  acc_des.use_with(policy);

  // Note: `ExecFunctor f();` would declare a function named f
  // (most vexing parse), so the object is declared without parentheses.
  ExecFunctor f;

  // init functor
  f.acc_src = acc_src;
  f.acc_des = acc_des;

  dispatch(policy,f);

  // get results stored in functor
  float64 min_val = f.min_val;
  index_t min_loc = f.min_loc;

  // sync values to node["des"]
  // (no op if node["des"] was originally device memory)
  acc_des.sync(node["des"]);
}
cyrush commented 1 year ago

Note: One of the key features here is runtime dispatch mapping to a set of concrete templates w/ tags.

These examples do not show how to implement the dispatch, but the interface is compatible with a code strategy I have tested.

MrBurmark commented 1 year ago
// c++20 generic lambda
dispatch(policy, [&] <typename Exec>(Exec &exec)
{
  ...
}

// c++14 generic lambda
dispatch(policy, [&] (auto &exec)
{
  using Exec = std::remove_reference_t<decltype(exec)>;
  ...
}
MrBurmark commented 1 year ago
// RAJA current reducer syntax
// The reducer object is captured by value in the loop lambda and
// queried after the loop. The loop index is `i`.
ReduceMinLoc<reduce_policy,float64> reducer(identity, -1);
forall<for_policy>(0, size,
    [=] (int i)
{
  const float64 val = 2.0 * acc_src[i];
  reducer.minloc(val,i);
  acc_des.set(i,val);
});
min_val = reducer.get();
min_loc = reducer.getLoc();

// RAJA new reducer syntax
// The reduction target is passed to forall as an extra argument and
// arrives in the lambda as a trailing reference parameter.
// The loop index is `i`.
RAJA::expt::ValLoc<float64> reducer(identity, -1);
forall<for_policy>(0, size,
    RAJA::expt::Reduce<RAJA::operators::minimum>(&reducer),
    [=] (int i, RAJA::expt::ValLoc<float64> &reducer)
{
  const float64 val = 2.0 * acc_src[i];
  reducer.min(val,i);
  acc_des.set(i,val);
});
min_val = reducer.getVal();
min_loc = reducer.getLoc();
cyrush commented 1 year ago

@MrBurmark thanks for this info!

JustinPrivitera commented 2 months ago

Notes from discussion:

JustinPrivitera commented 2 months ago

Let's skip move()/replace() for now because the cases are scary.

  1. execution accessor owns the data b/c it was asked to put data on device or on host and it wasn't there originally - easy case
  2. orig node owns the data - scary... maybe just use node move if you want to move
  3. dest ptr is the same as the original - ????
JustinPrivitera commented 1 month ago

`sync` copies the data back to where it came from (the node that we wrapped), or is a no-op if the data is already there. Calling `replace` will give the node the other_ptr and call `node.reset()` on the node where the data came from. Also change the exec accessor so that this works seamlessly: it should look like the simple case where the data was already there.

Maybe `assume` is a better name than `replace`.