Can `mx` and `my` be used with `sample`?

CUDA-QUANTUM version: 0.8.0 OS: x86 Amazon Linux 2023 2023.5.20240916

I'm trying to translate some simple kernels to OpenQASM2. Sampling a kernel which measures in the Pauli-Z basis works fine:

#include <cudaq.h>

// Quantum kernel that builds an N-qubit GHZ-style circuit and measures every
// qubit with mz. N is a compile-time template parameter; the call operator is
// marked __qpu__ and takes no arguments.
template <std::size_t N>
struct ghz {
  auto operator()() __qpu__ {

    // Compile-time sized array like std::array
    cudaq::qarray<N> q;
    // Hadamard on the first qubit only.
    h(q[0]);
    // Controlled-X across each neighbouring pair (q[i], q[i + 1]).
    // NOTE(review): N is std::size_t, so `N - 1` is unsigned and `i` is
    // compared as unsigned; N == 0 would wrap the bound. Fine for ghz<10>.
    for (int i = 0; i < N - 1; i++) {
      x<cudaq::ctrl>(q[i], q[i + 1]);
    }
    // Measure the whole register in one call; this is the variant the report
    // says samples correctly.
    mz(q);
  }
};

// Samples the 10-qubit ghz kernel with cudaq::sample and prints the counts.
// Always returns 0.
int main() {

  auto kernel = ghz<10>{};
  auto counts = cudaq::sample(kernel);

  // Print when MPI is not initialized, or on rank 0 when it is
  // (presumably so multi-rank runs do not repeat the output).
  if (!cudaq::mpi::is_initialized() || cudaq::mpi::rank() == 0) {
    // Dump the full result first, then walk the entries one by one.
    counts.dump();

    // Fine grain access to the bits and counts
    for (auto &[bits, count] : counts) {
      printf("Observed: %s, %lu\n", bits.data(), count);
    }
  }

  return 0;
}

The compile-and-translation pipeline I used was:

cudaq-quake ghz.cpp | cudaq-opt --pass-pipeline="builtin.module(canonicalize,lambda-lifting,apply-op-specialization,func.func(memtoreg{quantum=0}),cc-loop-normalize,cc-loop-unroll)"  | cudaq-translate --convert-to=openqasm2 &> ghz.qasm

However, if I change the final line of my kernel from mz(q) to mx(q) (or my(q)):

// Same N-qubit kernel as the mz version, except that the final measurement
// calls mx on the whole register. This is the variant that fails to compile.
template <std::size_t N>
struct ghz {
  auto operator()() __qpu__ {

    // Compile-time sized array like std::array
    cudaq::qarray<N> q;
    // Hadamard on the first qubit only.
    h(q[0]);
    // Controlled-X across each neighbouring pair (q[i], q[i + 1]).
    for (int i = 0; i < N - 1; i++) {
      x<cudaq::ctrl>(q[i], q[i + 1]);
    }
    // This is the call the compiler rejects: the only mx candidate it reports
    // takes a single `qubit &`, and cudaq::qarray<N> does not convert to that.
    mx(q);
  }
};

I see the following error (from the initial cudaq-quake pass, I think?):

ghz.cpp:11:5: error: no matching function for call to 'mx'
    mx(q);
    ^~
/usr/local/llvm/include/c++/v1/__functional/invoke.h:391:28: note: in instantiation of member function 'ghz<10>::operator()' requested here
_LIBCPP_CONSTEXPR decltype(std::declval<_Fp>()(std::declval<_Args>()...))
                           ^
/usr/local/llvm/include/c++/v1/__functional/invoke.h:401:19: note: while substituting deduced template arguments into function template '__invoke' [with _Fp = ghz<10> &, _Args = <>]
  static decltype(std::__invoke(std::declval<_XFp>(), std::declval<_XArgs>()...)) __try_call(int);
                  ^
/usr/local/llvm/include/c++/v1/__functional/invoke.h:407:28: note: while substituting deduced template arguments into function template '__try_call' [with _XFp = ghz<10> &, _XArgs = (no value)]
  using _Result = decltype(__try_call<_Fp, _Args...>(0));
                           ^
/usr/local/llvm/include/c++/v1/__functional/invoke.h:497:31: note: in instantiation of template class 'std::__invokable_r<void, ghz<10> &>' requested here
    : integral_constant<bool, __invokable<_Fn, _Args...>::value> {};
                              ^
/usr/local/llvm/include/c++/v1/__functional/invoke.h:504:40: note: in instantiation of template class 'std::is_invocable<ghz<10> &>' requested here
inline constexpr bool is_invocable_v = is_invocable<_Fn, _Args...>::value;
                                       ^
/opt/nvidia/cudaq/include/cudaq/concepts.h:21:37: note: (skipping 3 contexts in backtrace; use -ftemplate-backtrace-limit=0 to see all)
concept ValidArgumentsPassed = std::is_invocable_v<QuantumKernel, Args...>;
                                    ^
/opt/nvidia/cudaq/include/cudaq/algorithms/sample.h:31:5: note: while substituting template arguments into constraint expression here
    ValidArgumentsPassed<QuantumKernel, Args...> &&
    ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/opt/nvidia/cudaq/include/cudaq/algorithms/sample.h:203:12: note: while checking the satisfaction of concept 'SampleCallValid<ghz<10> &>' requested here
  requires SampleCallValid<QuantumKernel, Args...>
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/opt/nvidia/cudaq/include/cudaq/algorithms/sample.h:203:12: note: while substituting template arguments into constraint expression here
  requires SampleCallValid<QuantumKernel, Args...>
           ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ghz.cpp:18:17: note: while checking constraint satisfaction for template 'sample<ghz<10> &>' required here
  auto counts = cudaq::sample(kernel);
                ^~~~~
ghz.cpp:18:17: note: in instantiation of function template specialization 'cudaq::sample<ghz<10> &>' requested here
/opt/nvidia/cudaq/include/cudaq/qis/qubit_qis.h:760:23: note: candidate function not viable: no known conversion from 'cudaq::qarray<10UL>' to 'qubit &' (aka 'qudit<2> &') for 1st argument
inline measure_result mx(qubit &q) {
                      ^
error: C++ source has errors. nvq++ cannot proceed.
Aborted (core dumped)

Is there a header import I'm missing? Or another option I should pass to cudaq-quake? Or is this simply not supported yet?

NVIDIA / cuda-quantum

Can `mx` and `my` be used with `sample`? #2219