aws-neuron / aws-neuron-sdk


torch.argsort crashes when tensor is on Neuron device #868

Open evellasques opened 2 months ago

evellasques commented 2 months ago

For some reason, torch.argsort crashes when the tensor is on a Neuron device. For example, the snippet below works as expected on CPU:

>>> import torch
>>> import torch_neuronx
>>> x = torch.randint(10, [10,1])
>>> torch.argsort(x, dim=0)
tensor([[1],
        [8],
        [6],
        [4],
        [3],
        [7],
        [0],
        [5],
        [2],
        [9]])

But when I move x to the Neuron device:

>>> import torch_xla.core.xla_model as xm
>>> x = torch.randint(10, [10, 1]).to(xm.xla_device())
>>> torch.argsort(x, dim=0)

It crashes (with torch_xla's lazy execution, the failure surfaces when the graph is compiled, i.e. when the result is materialized):

2024-04-11 11:25:10.000599:  18868  INFO ||NEURON_CACHE||: Compile cache path: /var/tmp/neuron-compile-cache
2024-04-11 11:25:10.000601:  18868  ERROR ||NEURON_CC_WRAPPER||: Got a cached failed neff at /var/tmp/neuron-compile-cache/neuronxcc-2.13.66.0+6dfecc895/MODULE_1494686516964995913+d41d8cd9/model.neff. Will skip compilation, please set --retry_failed_compilation for recompilation: 
 Failed compilation with ['neuronx-cc', 'compile', '--target=trn1', '--framework=XLA', '/tmp/ubuntu/neuroncc_compile_workdir/e7a868ef-54fd-4c90-9210-6674cde1d517/model.MODULE_1494686516964995913+d41d8cd9.hlo_module.pb', '--output', '/tmp/ubuntu/neuroncc_compile_workdir/e7a868ef-54fd-4c90-9210-6674cde1d517/model.MODULE_1494686516964995913+d41d8cd9.neff', '--verbose=35']: 2024-04-11T10:42:56Z [TEN404] (_sort.12) Internal tensorizer error - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new
.
2024-04-11 11:25:10.636084: F ./torch_xla/csrc/runtime/debug_macros.h:20] Non-OK-status: status.status() status: INTERNAL: RunNeuronCCImpl: error condition error != 0: <class 'subprocess.CalledProcessError'>: Command '' died with <Signals.SIGHUP: 1>.
*** Begin stack trace ***
        tsl::CurrentStackTrace()
        std::unique_ptr<xla::PjRtLoadedExecutable, std::default_delete<xla::PjRtLoadedExecutable> > ConsumeValue<std::unique_ptr<xla::PjRtLoadedExecutable, std::default_delete<xla::PjRtLoadedExecutable> > >(absl::lts_20230125::StatusOr<std::unique_ptr<xla::PjRtLoadedExecutable, std::default_delete<xla::PjRtLoadedExecutable> > >&&)
        torch_xla::runtime::PjRtComputationClient::Compile(std::vector<torch_xla::runtime::ComputationClient::CompileInstance, std::allocator<torch_xla::runtime::ComputationClient::CompileInstance> >)
        torch_xla::XLAGraphExecutor::Compile(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > > const&, absl::lts_20230125::Span<std::string const>, torch::lazy::LazyGraphExecutor::SyncTensorCollection const&, torch::lazy::LazyGraphExecutor::PostOrderData*, std::vector<torch::lazy::Value, std::allocator<torch::lazy::Value> > const&)
        torch_xla::XLAGraphExecutor::SyncTensorsGraphInternal(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > >*, absl::lts_20230125::Span<std::string const>, torch::lazy::LazyGraphExecutor::SyncTensorsConfig const&, bool)
        torch_xla::XLAGraphExecutor::SyncTensorsGraph(std::vector<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >, std::allocator<c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> > > >*, absl::lts_20230125::Span<std::string const>, bool, bool, bool)
        torch_xla::XLATensor::ApplyPendingGraph()
        torch_xla::XLATensor::GetXlaData()
        torch_xla::XLATensor::ToTensor(bool)
        torch_xla::XLANativeFunctions::_to_copy(at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, c10::optional<c10::MemoryFormat>)

        at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, c10::optional<c10::MemoryFormat>)

        at::_ops::_to_copy::redispatch(c10::DispatchKeySet, at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, c10::optional<c10::MemoryFormat>)

        at::_ops::_to_copy::call(at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, c10::optional<c10::MemoryFormat>)
        at::native::to(at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, bool, c10::optional<c10::MemoryFormat>)

        at::_ops::to_dtype_layout::call(at::Tensor const&, c10::optional<c10::ScalarType>, c10::optional<c10::Layout>, c10::optional<c10::Device>, c10::optional<bool>, bool, bool, c10::optional<c10::MemoryFormat>)

        _PyEval_EvalFrameDefault

        _PyFunction_Vectorcall
        _PyEval_EvalFrameDefault

        _PyFunction_Vectorcall
        _PyEval_EvalFrameDefault

        _PyFunction_Vectorcall

        PyObject_Repr
        PyFile_WriteObject

        _PyEval_EvalFrameDefault

        _PyEval_EvalCodeWithName
        PyEval_EvalCodeEx
        PyEval_EvalCode

        PyRun_InteractiveLoopFlags
        PyRun_AnyFileExFlags
        Py_RunMain
        Py_BytesMain
        __libc_start_main
        _start
*** End stack trace ***

Aborted (core dumped)
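The cached-failure message in the log suggests forcing recompilation of a previously failed NEFF. A minimal sketch of how that flag is typically passed, assuming the standard NEURON_CC_FLAGS environment variable (the flag name itself is taken from the log above):

```shell
# Force the Neuron compiler to retry NEFFs that previously failed to compile,
# instead of reusing the cached failure.
export NEURON_CC_FLAGS="--retry_failed_compilation"
python your_script.py
```

Note that retrying does not help here, since the underlying compilation of the sort op fails deterministically; it is only useful once a fixed compiler version is installed.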

Output of pip list for the relevant packages:

libneuronxla==2.0.965
neuronx-cc==2.13.66.0+6dfecc895
numpy==1.25.2
torch==2.1.2
torch-neuronx==2.1.2.2.1.0
torch-xla==2.1.2
torchmetrics==0.10.3
torchvision==0.16.2

EC2 Instance: trn1.32xlarge
Instance ID: i-08d5b389cb04f873d
Neuron Runtime: aws-neuronx-runtime-lib/unknown,now 2.20.22.0-1b3ca6425
OS: Ubuntu 20

jluntamazon commented 2 months ago

Hello @evellasques,

Sorting operations are not currently supported. We have completed the work to add support for them, and it will be included in an upcoming release.

We will update this ticket when the release is live and support has been added.
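Until that release is out, one possible interim workaround is to run the sort on CPU and move only the resulting indices back to the device. A minimal sketch (the helper name is hypothetical; note that calling `.cpu()` on an XLA tensor forces a graph sync, so this adds host round-trip overhead):

```python
import torch


def argsort_cpu_fallback(x: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Hypothetical helper: compute argsort on CPU, where the op is supported,
    # then move the index tensor back to the input's original device
    # (e.g. the Neuron/XLA device). For XLA tensors, .cpu() materializes
    # the pending lazy graph.
    indices = torch.argsort(x.cpu(), dim=dim)
    return indices.to(x.device)


# On CPU this behaves identically to torch.argsort; on a Neuron device it
# avoids lowering the unsupported sort op into the compiled graph.
x = torch.randint(10, (10, 1))
idx = argsort_cpu_fallback(x, dim=0)
```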