Unable to compile cumm (fails with "error: ambiguous overload for ‘operator=’")

I'm on Ubuntu 22.04 in WSL and am trying to install cumm in a conda environment. Building the wheels fails with the attached error log.
log.txt
The text of the error log is too long to paste into the issue window. Here's a snippet from the full log showing what I believe is the main error and its stack trace:
Building wheels for collected packages: cumm-cu120
  Building wheel for cumm-cu120 (pyproject.toml): started
  Building wheel for cumm-cu120 (pyproject.toml): finished with status 'error'
  error: subprocess-exited-with-error

  × Building wheel for cumm-cu120 (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [675 lines of output]
      running bdist_wheel
      running build
      running build_py
      copying cumm/__version__.py -> build/lib.linux-x86_64-cpython-311/cumm
      running egg_info
      writing cumm_cu120.egg-info/PKG-INFO
      writing dependency_links to cumm_cu120.egg-info/dependency_links.txt
      writing requirements to cumm_cu120.egg-info/requires.txt
      writing top-level names to cumm_cu120.egg-info/top_level.txt
      reading manifest file 'cumm_cu120.egg-info/SOURCES.txt'
      reading manifest template 'MANIFEST.in'
      adding license file 'LICENSE'
      writing manifest file 'cumm_cu120.egg-info/SOURCES.txt'
      running build_ext
      [1/39] [GCC][c++]/home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/core_cc_pybind_main/core_cc_pybind_main.cc.o
      [2/39] [GCC][c++]/home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc.o
      FAILED: /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc.o
      g++ -MMD -MT /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc.o -MF /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc.o.d -I "/home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include" -I "/tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include" -I "/home/matt/miniconda3/envs/test/include/python3.11" -I "/usr/local/cuda/include" -I "/home/matt/Code/cumm/include" -O3 -std=c++14 -fPIC -DTV_CUDA -c /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc -o /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc.o
      In file included from /usr/include/c++/11/bits/forward_list.h:38,
                       from /usr/include/c++/11/forward_list:38,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../detail/common.h:306,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../attr.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h:12,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/pybind11.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/stl.h:12,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/PyBindTensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:1:
      /usr/include/c++/11/bits/stl_algobase.h: In instantiation of ‘static _OI std::__copy_move<false, false, std::random_access_iterator_tag>::__copy_m(_II, _II, _OI) [with _II = const long int*; _OI = __half*]’:
      /usr/include/c++/11/bits/stl_algobase.h:495:30:   required from ‘_OI std::__copy_move_a2(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:522:42:   required from ‘_OI std::__copy_move_a1(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:530:31:   required from ‘_OI std::__copy_move_a(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:620:7:   required from ‘_OI std::copy(_II, _II, _OI) [with _II = const long int*; _OI = __half*]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1643:24:   required from ‘tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>::<lambda(auto:28)>::<lambda(auto:29)> [with auto:29 = tv::detail::_identity]’
      /home/matt/Code/cumm/include/tensorview/core/cc17.h:124:47:   [ skipping 12 instantiation contexts, use -ftemplate-backtrace-limit=0 to disable ]
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:106:26:   required from ‘constexpr F tv::detail::mp_for_each_impl(tv::mp_list<Ts ...>, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:497:34:   required from ‘constexpr F tv::mp_for_each(F&&) [with L = tv::mp_list<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16>; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:372:3:   required from ‘bool tv::dispatch_noexcept(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:384:32:   required from ‘void tv::dispatch(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:526:29:   required from ‘void tv::Dispatch<T<Args ...> >::operator()(tv::DType, F&&) [with F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>; T = std::tuple; Args = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1614:43:   required from here
      /usr/include/c++/11/bits/stl_algobase.h:385:25: error: ambiguous overload for ‘operator=’ (operand types are ‘__half’ and ‘const long int’)
        385 |               *__result = *__first;
            |               ~~~~~~~~~~^~~~~~~~~~
      In file included from /usr/local/cuda/include/cuda_fp16.h:4748,
                       from /home/matt/Code/cumm/include/tensorview/dtypes.h:18,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_fp16.hpp:149:50: note: candidate: ‘__half& __half::operator=(float)’
        149 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:150:50: note: candidate: ‘__half& __half::operator=(double)’
        150 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:257:50: note: candidate: ‘__half& __half::operator=(short int)’
        257 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:258:50: note: candidate: ‘__half& __half::operator=(short unsigned int)’
        258 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:259:50: note: candidate: ‘__half& __half::operator=(int)’
        259 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:260:50: note: candidate: ‘__half& __half::operator=(unsigned int)’
        260 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:261:50: note: candidate: ‘__half& __half::operator=(long long int)’
        261 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:262:50: note: candidate: ‘__half& __half::operator=(long long unsigned int)’
        262 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      In file included from /home/matt/Code/cumm/include/tensorview/dtypes.h:18,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_fp16.h:4210:26: note: candidate: ‘constexpr __half& __half::operator=(const __half&)’
       4210 | struct __CUDA_ALIGN__(2) __half {
            |                          ^~~~~~
      /usr/local/cuda/include/cuda_fp16.h:4210:26: note: candidate: ‘constexpr __half& __half::operator=(__half&&)’
      In file included from /usr/include/c++/11/bits/forward_list.h:38,
                       from /usr/include/c++/11/forward_list:38,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../detail/common.h:306,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../attr.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h:12,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/pybind11.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/stl.h:12,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/PyBindTensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:1:
      /usr/include/c++/11/bits/stl_algobase.h: In instantiation of ‘static _OI std::__copy_move<false, false, std::random_access_iterator_tag>::__copy_m(_II, _II, _OI) [with _II = const long unsigned int*; _OI = __half*]’:
      /usr/include/c++/11/bits/stl_algobase.h:495:30:   required from ‘_OI std::__copy_move_a2(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:522:42:   required from ‘_OI std::__copy_move_a1(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:530:31:   required from ‘_OI std::__copy_move_a(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __half*]’
      /usr/include/c++/11/bits/stl_algobase.h:620:7:   required from ‘_OI std::copy(_II, _II, _OI) [with _II = const long unsigned int*; _OI = __half*]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1643:24:   required from ‘tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>::<lambda(auto:28)>::<lambda(auto:29)> [with auto:29 = tv::detail::_identity]’
      /home/matt/Code/cumm/include/tensorview/core/cc17.h:124:47:   [ skipping 12 instantiation contexts, use -ftemplate-backtrace-limit=0 to disable ]
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:106:26:   required from ‘constexpr F tv::detail::mp_for_each_impl(tv::mp_list<Ts ...>, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:497:34:   required from ‘constexpr F tv::mp_for_each(F&&) [with L = tv::mp_list<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16>; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:372:3:   required from ‘bool tv::dispatch_noexcept(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:384:32:   required from ‘void tv::dispatch(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:526:29:   required from ‘void tv::Dispatch<T<Args ...> >::operator()(tv::DType, F&&) [with F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>; T = std::tuple; Args = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1614:43:   required from here
      /usr/include/c++/11/bits/stl_algobase.h:385:25: error: ambiguous overload for ‘operator=’ (operand types are ‘__half’ and ‘const long unsigned int’)
        385 |               *__result = *__first;
            |               ~~~~~~~~~~^~~~~~~~~~
      In file included from /usr/local/cuda/include/cuda_fp16.h:4748,
                       from /home/matt/Code/cumm/include/tensorview/dtypes.h:18,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_fp16.hpp:149:50: note: candidate: ‘__half& __half::operator=(float)’
        149 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const float f) { __x = __float2half(f).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:150:50: note: candidate: ‘__half& __half::operator=(double)’
        150 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const double f) { __x = __double2half(f).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:257:50: note: candidate: ‘__half& __half::operator=(short int)’
        257 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:258:50: note: candidate: ‘__half& __half::operator=(short unsigned int)’
        258 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:259:50: note: candidate: ‘__half& __half::operator=(int)’
        259 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:260:50: note: candidate: ‘__half& __half::operator=(unsigned int)’
        260 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:261:50: note: candidate: ‘__half& __half::operator=(long long int)’
        261 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      /usr/local/cuda/include/cuda_fp16.hpp:262:50: note: candidate: ‘__half& __half::operator=(long long unsigned int)’
        262 | __CUDA_HOSTDEVICE__ __CUDA_FP16_INLINE__ __half &__half::operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
            |                                                  ^~~~~~
      In file included from /home/matt/Code/cumm/include/tensorview/dtypes.h:18,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_fp16.h:4210:26: note: candidate: ‘constexpr __half& __half::operator=(const __half&)’
       4210 | struct __CUDA_ALIGN__(2) __half {
            |                          ^~~~~~
      /usr/local/cuda/include/cuda_fp16.h:4210:26: note: candidate: ‘constexpr __half& __half::operator=(__half&&)’
      In file included from /usr/include/c++/11/bits/forward_list.h:38,
                       from /usr/include/c++/11/forward_list:38,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../detail/common.h:306,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../attr.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h:12,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/pybind11.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/stl.h:12,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/PyBindTensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:1:
      /usr/include/c++/11/bits/stl_algobase.h: In instantiation of ‘static _OI std::__copy_move<false, false, std::random_access_iterator_tag>::__copy_m(_II, _II, _OI) [with _II = const long int*; _OI = __nv_bfloat16*]’:
      /usr/include/c++/11/bits/stl_algobase.h:495:30:   required from ‘_OI std::__copy_move_a2(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:522:42:   required from ‘_OI std::__copy_move_a1(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:530:31:   required from ‘_OI std::__copy_move_a(_II, _II, _OI) [with bool _IsMove = false; _II = const long int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:620:7:   required from ‘_OI std::copy(_II, _II, _OI) [with _II = const long int*; _OI = __nv_bfloat16*]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1643:24:   required from ‘tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>::<lambda(auto:28)>::<lambda(auto:29)> [with auto:29 = tv::detail::_identity]’
      /home/matt/Code/cumm/include/tensorview/core/cc17.h:124:47:   [ skipping 12 instantiation contexts, use -ftemplate-backtrace-limit=0 to disable ]
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:106:26:   required from ‘constexpr F tv::detail::mp_for_each_impl(tv::mp_list<Ts ...>, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:497:34:   required from ‘constexpr F tv::mp_for_each(F&&) [with L = tv::mp_list<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16>; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:372:3:   required from ‘bool tv::dispatch_noexcept(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:384:32:   required from ‘void tv::dispatch(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:526:29:   required from ‘void tv::Dispatch<T<Args ...> >::operator()(tv::DType, F&&) [with F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>; T = std::tuple; Args = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1614:43:   required from here
      /usr/include/c++/11/bits/stl_algobase.h:385:25: error: ambiguous overload for ‘operator=’ (operand types are ‘__nv_bfloat16’ and ‘const long int’)
        385 |               *__result = *__first;
            |               ~~~~~~~~~~^~~~~~~~~~
      In file included from /usr/local/cuda/include/cuda_bf16.h:4756,
                       from /home/matt/Code/cumm/include/tensorview/dtypes.h:21,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_bf16.hpp:123:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(float)’
        123 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:124:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(double)’
        124 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:238:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(short int)’
        238 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:239:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(short unsigned int)’
        239 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:240:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(int)’
        240 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:241:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(unsigned int)’
        241 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:242:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(long long int)’
        242 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:243:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(long long unsigned int)’
        243 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      In file included from /home/matt/Code/cumm/include/tensorview/dtypes.h:21,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_bf16.h:4215:26: note: candidate: ‘constexpr __nv_bfloat16& __nv_bfloat16::operator=(const __nv_bfloat16&)’
       4215 | struct __CUDA_ALIGN__(2) __nv_bfloat16 {
            |                          ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.h:4215:26: note: candidate: ‘constexpr __nv_bfloat16& __nv_bfloat16::operator=(__nv_bfloat16&&)’
      In file included from /usr/include/c++/11/bits/forward_list.h:38,
                       from /usr/include/c++/11/forward_list:38,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../detail/common.h:306,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/../attr.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h:12,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/pybind11.h:13,
                       from /tmp/pip-build-env-sh7fzxa7/overlay/lib/python3.11/site-packages/pybind11/include/pybind11/stl.h:12,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/PyBindTensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:1:
      /usr/include/c++/11/bits/stl_algobase.h: In instantiation of ‘static _OI std::__copy_move<false, false, std::random_access_iterator_tag>::__copy_m(_II, _II, _OI) [with _II = const long unsigned int*; _OI = __nv_bfloat16*]’:
      /usr/include/c++/11/bits/stl_algobase.h:495:30:   required from ‘_OI std::__copy_move_a2(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:522:42:   required from ‘_OI std::__copy_move_a1(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:530:31:   required from ‘_OI std::__copy_move_a(_II, _II, _OI) [with bool _IsMove = false; _II = const long unsigned int*; _OI = __nv_bfloat16*]’
      /usr/include/c++/11/bits/stl_algobase.h:620:7:   required from ‘_OI std::copy(_II, _II, _OI) [with _II = const long unsigned int*; _OI = __nv_bfloat16*]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1643:24:   required from ‘tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>::<lambda(auto:28)>::<lambda(auto:29)> [with auto:29 = tv::detail::_identity]’
      /home/matt/Code/cumm/include/tensorview/core/cc17.h:124:47:   [ skipping 12 instantiation contexts, use -ftemplate-backtrace-limit=0 to disable ]
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:106:26:   required from ‘constexpr F tv::detail::mp_for_each_impl(tv::mp_list<Ts ...>, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/core/mp_helper.h:497:34:   required from ‘constexpr F tv::mp_for_each(F&&) [with L = tv::mp_list<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16>; F = tv::dispatch_noexcept<float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)> >(tv::DType, tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>&&)::<lambda(auto:3)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:372:3:   required from ‘bool tv::dispatch_noexcept(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:384:32:   required from ‘void tv::dispatch(tv::DType, F&&) [with Ts = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}; F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:526:29:   required from ‘void tv::Dispatch<T<Args ...> >::operator()(tv::DType, F&&) [with F = tv::Tensor::astype(tv::DType, bool) const::<lambda(auto:27)>; T = std::tuple; Args = {float, double, signed char, short int, int, long int, unsigned char, short unsigned int, unsigned int, long unsigned int, bool, __half, __nv_bfloat16}]’
      /home/matt/Code/cumm/include/tensorview/tensor.h:1614:43:   required from here
      /usr/include/c++/11/bits/stl_algobase.h:385:25: error: ambiguous overload for ‘operator=’ (operand types are ‘__nv_bfloat16’ and ‘const long unsigned int’)
        385 |               *__result = *__first;
            |               ~~~~~~~~~~^~~~~~~~~~
      In file included from /usr/local/cuda/include/cuda_bf16.h:4756,
                       from /home/matt/Code/cumm/include/tensorview/dtypes.h:21,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_bf16.hpp:123:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(float)’
        123 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:124:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(double)’
        124 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:238:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(short int)’
        238 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:239:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(short unsigned int)’
        239 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:240:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(int)’
        240 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:241:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(unsigned int)’
        241 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:242:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(long long int)’
        242 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.hpp:243:61: note: candidate: ‘__nv_bfloat16& __nv_bfloat16::operator=(long long unsigned int)’
        243 |     __CUDA_HOSTDEVICE__ __CUDA_BF16_INLINE__ __nv_bfloat16 &__nv_bfloat16::operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; }
            |                                                             ^~~~~~~~~~~~~
      In file included from /home/matt/Code/cumm/include/tensorview/dtypes.h:21,
                       from /home/matt/Code/cumm/include/tensorview/tensorview.h:28,
                       from /home/matt/Code/cumm/include/tensorview/cuda/driverops.h:16,
                       from /home/matt/Code/cumm/include/tensorview/contexts/core.h:20,
                       from /home/matt/Code/cumm/include/tensorview/context.h:2,
                       from /home/matt/Code/cumm/include/tensorview/tensor.h:94,
                       from /home/matt/Code/cumm/include/tensorview/pybind.h:16,
                       from /home/matt/Code/cumm/include/tensorview/pybind_utils.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/include/tensorview_bind/TensorViewBind.h:2,
                       from /home/matt/Code/cumm/build/temp.linux-x86_64-cpython-311/cumm/build/core_cc/src/tensorview_bind/PyBindTensorViewBind/PyBindTensorViewBind_bind_TensorViewBind.cc:2:
      /usr/local/cuda/include/cuda_bf16.h:4215:26: note: candidate: ‘constexpr __nv_bfloat16& __nv_bfloat16::operator=(const __nv_bfloat16&)’
       4215 | struct __CUDA_ALIGN__(2) __nv_bfloat16 {
            |                          ^~~~~~~~~~~~~
      /usr/local/cuda/include/cuda_bf16.h:4215:26: note: candidate: ‘constexpr __nv_bfloat16& __nv_bfloat16::operator=(__nv_bfloat16&&)’
FindDefinition / cumm

Unable to compile cumm (fails with "error: ambiguous overload for ‘operator=’") #23