Closed tengerye closed 1 year ago
@tengerye thanks for reaching out. We will look into this issue and update. Meanwhile, can you also provide the complete terminal log along with rank/backend initialization outputs ?
@jerome-habana Hi, sure. The following is the complete log. Please let me know if I can help.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: True, using: 8 HPUs
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8
Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8
Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8
Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8
Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8
----------------------------------------------------------------------------------------------------
distributed_backend=hccl
All distributed processes registered. Starting with 8 processes
----------------------------------------------------------------------------------------------------
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fdc708b4f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7fdc708b58fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fdcccc58f10]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_ifs(std::vector<hccl::detected_tcp_if, std::allocator<hccl::detected_tcp_if> >&)+0x16a) [0x7fdcb61a793a]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_if()+0x39) [0x7fdcb61a80e9]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_coordinator::create(bool)+0x1f2) [0x7fdcb616f092]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::add_new_coordinator(bool)+0x34) [0x7fdcb615f294]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::get_unique_id(hcclUniqueId*)+0x20c) [0x7fdcb616011c]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_Original(hcclUniqueId*)+0x2f) [0x7fdcb611e67f]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_impl(hcclUniqueId*)+0x38) [0x7fdcb6122ac8]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/lib/libpytorch_synapse_helpers.so(hcclGetUniqueId+0x33) [0x7fdb767f2ff3]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getComm(int)+0xfd) [0x7fdb75cb4eed]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getCommList(std::vector<int, std::allocator<int> > const&)+0x12a) [0x7fdb75cb556a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::broadcast(std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::BroadcastOptions const&)+0x46c) [0x7fdb75cb8e1c]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x464a6a9) [0x7fdcc34e26a9]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x4650208) [0x7fdcc34e8208]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(c10d::ops::broadcast(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, c10::ArrayRef<at::Tensor>, c10d::BroadcastOptions const&)+0x255) [0x7fdcc34e53d5]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x94d5f5) [0x7fdcc9a0d5f5]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x2279f7) [0x7fdcc92e79f7]
python() [0x5d80be]
python(_PyObject_MakeTpCall+0x34c) [0x5d8d8c]
python() [0x4fedd4]
python() [0x4997c7]
python(_PyEval_EvalCodeWithName+0x481) [0x55cd91]
python(_PyFunction_Vectorcall+0x1e1) [0x5d8941]
python() [0x49abe4]
python(_PyEval_EvalCodeWithName+0x481) [0x55cd91]
python(_PyFunction_Vectorcall+0x1e1) [0x5d8941]
python() [0x49abe4]
python(_PyEval_EvalCodeWithName+0x481) [0x55cd91]
Internal Error: Received signal - Segmentation fault
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Files already downloaded and verified
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Files already downloaded and verified
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Traceback (most recent call last):
File "/projects/test/test.py", line 68, in <module>
trainer.fit(model=LitModel(), train_dataloaders=trainloader)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 603, in fit
call._call_and_handle_interrupt(
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/call.py", line 38, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 645, in _fit_impl
self._run(model, ckpt_path=self.ckpt_path)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1035, in _run
self.__setup_profiler()
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1495, in __setup_profiler
self.profiler.setup(stage=self.state.fn, local_rank=local_rank, log_dir=self.log_dir)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/trainer.py", line 1828, in log_dir
dirpath = self.strategy.broadcast(dirpath)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/strategies/hpu_parallel.py", line 138, in broadcast
broadcast_object_list(obj, src, group=_group.WORLD)
File "/usr/local/lib/python3.8/dist-packages/pytorch_lightning/overrides/torch_distributed.py", line 135, in _broadcast_object_list
broadcast(object_sizes_tensor, src=src, group=group)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Connection reset by peer
Hi @tengerye , this is an internal Gaudi NIC communication issue — some ports may be down. It should be resolved by removing and reloading the Habana device drivers:
$ sudo modprobe -r habanalabs habanalabs_en   # to remove drivers
$ sudo modprobe habanalabs habanalabs_en      # to reload drivers
Please try to run your script again after that. By the way, your script is ok.
Hi @ytang-habana, thank you so much for your kind reply. I executed the above commands twice. The first time was NOT in the docker container, and it completed without any error.
The second time was in the docker container, but I got the following errors when executing the commands:
root@tye-ec2:/projects/test# modprobe -r habanalabs habanalabs_en
root@tye-ec2:/projects/test# modprobe habanalabs habanalabs_en
modprobe: FATAL: Module habanalabs not found in directory /lib/modules/5.4.0-1065-aws
However, I have to run my script inside the container. The command I used to start container is:
docker run -it -p 8888:8888 -p 6006:6006 -v MY_PATH:/projects --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host MY_IMAGE
@tengerye , before you start the container with the "docker run" command, you need to run: modprobe -r habanalabs habanalabs_en followed by modprobe habanalabs habanalabs_en. After that, you can start the container. This is because Docker is the application layer; it sits on top of the device driver.
@ytang-habana Hi, thank you for your kind reply. Here is what I did just now:
sudo modprobe -r habanalabs habanalabs_en
and sudo modprobe habanalabs habanalabs_en
Then I executed: docker run -it -p 8888:8888 -p 6006:6006 -v MY_PATH:/projects --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host MY_IMAGE
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: True, using: 4 HPUs
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/4
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/4
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/4
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/4
----------------------------------------------------------------------------------------------------
distributed_backend=hccl
All distributed processes registered. Starting with 4 processes
----------------------------------------------------------------------------------------------------
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7ff9ab250f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7ff9ab2518fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7ffa135f8f10]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_ifs(std::vector<hccl::detected_tcp_if, std::allocator<hccl::detected_tcp_if> >&))
Hi @tengerye, 1) first launch $modprobe -r habanalabs habanalabs_en $modprobe habanalabs habanalabs_en in AMI
2) Please run
$hl-smi and you may get something like:
+-----------------------------------------------------------------------------+
| HL-SMI Version: hl-1.7.1-fw-38.3.0.0 |
| Driver Version: 1.7.1-68c1a21 |
|-------------------------------+----------------------+----------------------+
| AIP Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | AIP-Util Compute M. |
|===============================+======================+======================|
| 0 HL-205 N/A | 0000:19:00.0 N/A | 0 |
| N/A 44C N/A 106W / 350W | 512Mib / 32768Mib | 16% N/A |
|-------------------------------+----------------------+----------------------+
| 1 HL-205 N/A | 0000:b3:00.0 N/A | 0 |
| N/A 42C N/A 108W / 350W | 512Mib / 32768Mib | 16% N/A |
|-------------------------------+----------------------+----------------------+
| 2 HL-205 N/A | 0000:1a:00.0 N/A | 0 |
| N/A 36C N/A 113W / 350W | 512Mib / 32768Mib | 18% N/A |
|-------------------------------+----------------------+----------------------+
| 3 HL-205 N/A | 0000:b4:00.0 N/A | 0 |
| N/A 38C N/A 105W / 350W | 512Mib / 32768Mib | 15% N/A |
|-------------------------------+----------------------+----------------------+
| 4 HL-205 N/A | 0000:33:00.0 N/A | 0 |
| N/A 35C N/A 105W / 350W | 512Mib / 32768Mib | 15% N/A |
|-------------------------------+----------------------+----------------------+
| 5 HL-205 N/A | 0000:cc:00.0 N/A | 0 |
| N/A 35C N/A 107W / 350W | 512Mib / 32768Mib | 16% N/A |
|-------------------------------+----------------------+----------------------+
| 6 HL-205 N/A | 0000:34:00.0 N/A | 0 |
| N/A 39C N/A 100W / 350W | 512Mib / 32768Mib | 14% N/A |
|-------------------------------+----------------------+----------------------+
| 7 HL-205 N/A | 0000:cd:00.0 N/A | 0 |
| N/A 40C N/A 107W / 350W | 512Mib / 32768Mib | 16% N/A |
|-------------------------------+----------------------+----------------------+
| Compute Processes: AIP Memory |
| AIP PID Type Process name Usage |
|=============================================================================|
| 0 N/A N/A N/A N/A |
| 1 N/A N/A N/A N/A |
| 2 N/A N/A N/A N/A |
| 3 N/A N/A N/A N/A |
| 4 N/A N/A N/A N/A |
| 5 N/A N/A N/A N/A |
| 6 N/A N/A N/A N/A |
| 7 N/A N/A N/A N/A |
+=============================================================================+
3) check each Bus-Id one by one to make sure that all ports are up for each Bus-Id $hl-smi -n link -i 0000:19:00.0 port 0: UP port 2: UP port 3: UP port 4: UP port 5: UP port 6: UP port 7: UP ..... $hl-smi -n link -i 0000:cd:00.0 port 0: UP port 2: UP port 3: UP port 4: UP port 5: UP port 6: UP port 7: UP
make sure all the ports for 8 gaudi pci are up. (if you see some ports are not in UP state, you may need to discard this one and create another AMI to try.)
4) launch docker ( like what you did)
Hi @ytang-habana , I followed your kind instructions. Unfortunately, everything went well until the last step. It gave similar errors.
Hi @tengerye , Follow this doc: AWS DL1 Quick Start
I launch an EC2 DL1 instance and run the following docker, your PTL script works:
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --privileged vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest
it works with the following logs:
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: True, using: 8 HPUs
Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/8
Initializing distributed: GLOBAL_RANK: 2, MEMBER: 3/8
Initializing distributed: GLOBAL_RANK: 3, MEMBER: 4/8
Initializing distributed: GLOBAL_RANK: 4, MEMBER: 5/8
Initializing distributed: GLOBAL_RANK: 5, MEMBER: 6/8
Initializing distributed: GLOBAL_RANK: 6, MEMBER: 7/8
Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/8
Initializing distributed: GLOBAL_RANK: 7, MEMBER: 8/8
----------------------------------------------------------------------------------------------------
distributed_backend=hccl
All distributed processes registered. Starting with 8 processes
----------------------------------------------------------------------------------------------------
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784300912 KB
============================================================================================
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784300912 KB
============================================================================================
.......
| Name | Type | Params
-------------------------------
0 | model | Net | 62.0 K
-------------------------------
62.0 K Trainable params
0 Non-trainable params
62.0 K Total params
0.248 Total estimated model params size (MB)
/usr/local/lib/python3.8/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:224: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 96 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
Long tensor unsupported on HPU, casting to float
Long tensor unsupported on HPU, casting to float
Files already downloaded and verified
Epoch 0: 0%| | 0/100 [00:00<?, ?it/s] Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
[W reducer.cpp:1298] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator())
Epoch 0: 100% .....
.....
Epoch 9: 100%|██████████| 100/100 [00:01<00:00, 71.92it/s, loss=1.67, v_num=1]`Trainer.fit` stopped: `max_epochs=10` reached.
Epoch 9: 100%|██████████| 100/100 [00:01<00:00, 70.82it/s, loss=1.67, v_num=1]
root@ip-172-16-9-118:~/test#
Hi @ytang-habana, it doesn't work either. Here is what I did:
First, I executed docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --privileged vault.habana.ai/gaudi-docker/1.8.0/ubuntu18.04/habanalabs/pytorch-installer-1.13.1:latest
Note, I used ubuntu18.04 instead of ubuntu20.04 because my machine is ubuntu18.04, I don't know if it is a problem.
Second, I executed git clone https://github.com/HabanaAI/Model-References.git
Third, I ran export PYTHONPATH=$PYTHONPATH:Model-References
and export PYTHON=/usr/bin/python3.8
.
Fourth, I executed $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 \ --gamma=0.7 --hpu --hmp \ --hmp-bf16=ops_bf16_mnist.txt \ --hmp-fp32=ops_fp32_mnist.txt \ --use_lazy_mode
Everything went well so far.
Fifth, I ran
mpirun -n 8 --bind-to core --map-by slot:PE=6 \ --rank-by core --report-bindings \ --allow-run-as-root \ $PYTHON mnist.py \ --batch-size=64 --epochs=1 \ --lr=1.0 --gamma=0.7 \ --hpu --hmp --hmp-bf16=ops_bf16_mnist.txt \ --hmp-fp32=ops_fp32_mnist.txt \ --use_lazy_mode
However, I got the following errors:
[tye-ec2:00527] MCW rank 1 bound to socket 0[core 6[hwt 0-1]], socket 0[core 7[hwt 0-1]], socket 0[core 8[hwt 0-1]], socket 0[core 9[hwt 0-1]], socket 0[core 10[hwt 0-1]], socket 0[core 11[hwt 0-1]]: [../../../../../../BB/BB/BB/BB/BB/BB/../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
[tye-ec2:00527] MCW rank 2 bound to socket 0[core 12[hwt 0-1]], socket 0[core 13[hwt 0-1]], socket 0[core 14[hwt 0-1]], socket 0[core 15[hwt 0-1]], socket 0[core 16[hwt 0-1]], socket 0[core 17[hwt 0-1]]: [../../../../../../../../../../../../BB/BB/BB/BB/BB/BB/../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
[tye-ec2:00527] MCW rank 3 bound to socket 0[core 18[hwt 0-1]], socket 0[core 19[hwt 0-1]], socket 0[core 20[hwt 0-1]], socket 0[core 21[hwt 0-1]], socket 0[core 22[hwt 0-1]], socket 0[core 23[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../BB/BB/BB/BB/BB/BB][../../../../../../../../../../../../../../../../../../../../../../../..]
[tye-ec2:00527] MCW rank 4 bound to socket 1[core 24[hwt 0-1]], socket 1[core 25[hwt 0-1]], socket 1[core 26[hwt 0-1]], socket 1[core 27[hwt 0-1]], socket 1[core 28[hwt 0-1]], socket 1[core 29[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][BB/BB/BB/BB/BB/BB/../../../../../../../../../../../../../../../../../..]
[tye-ec2:00527] MCW rank 5 bound to socket 1[core 30[hwt 0-1]], socket 1[core 31[hwt 0-1]], socket 1[core 32[hwt 0-1]], socket 1[core 33[hwt 0-1]], socket 1[core 34[hwt 0-1]], socket 1[core 35[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../BB/BB/BB/BB/BB/BB/../../../../../../../../../../../..]
[tye-ec2:00527] MCW rank 6 bound to socket 1[core 36[hwt 0-1]], socket 1[core 37[hwt 0-1]], socket 1[core 38[hwt 0-1]], socket 1[core 39[hwt 0-1]], socket 1[core 40[hwt 0-1]], socket 1[core 41[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../BB/BB/BB/BB/BB/BB/../../../../../..]
[tye-ec2:00527] MCW rank 7 bound to socket 1[core 42[hwt 0-1]], socket 1[core 43[hwt 0-1]], socket 1[core 44[hwt 0-1]], socket 1[core 45[hwt 0-1]], socket 1[core 46[hwt 0-1]], socket 1[core 47[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../BB/BB/BB/BB/BB/BB]
[tye-ec2:00527] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt 0-1]], socket 0[core 2[hwt 0-1]], socket 0[core 3[hwt 0-1]], socket 0[core 4[hwt 0-1]], socket 0[core 5[hwt 0-1]]: [BB/BB/BB/BB/BB/BB/../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
| distributed init (rank 5): env://
| distributed init (rank 6): env://
| distributed init (rank 7): env://
| distributed init (rank 0): env://
| distributed init (rank 4): env://
| distributed init (rank 2): env://
| distributed init (rank 3): env://
| distributed init (rank 1): env://
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784313276 KB
============================================================================================
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fd9e6640f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7fd9e66418fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fda43c0bf10]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_ifs(std::vector<hccl::detected_tcp_if, std::allocator<hccl::detected_tcp_if> >&)+0x16a) [0x7fd9e54bd93a]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_if()+0x39) [0x7fd9e54be0e9]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_coordinator::create(bool)+0x1f2) [0x7fd9e5485092]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::add_new_coordinator(bool)+0x34) [0x7fd9e5475294]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::get_unique_id(hcclUniqueId*)+0x20c) [0x7fd9e547611c]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_Original(hcclUniqueId*)+0x2f) [0x7fd9e543467f]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_impl(hcclUniqueId*)+0x38) [0x7fd9e5438ac8]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/lib/libpytorch_synapse_helpers.so(hcclGetUniqueId+0x33) [0x7fd9f1f69ff3]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getComm(int)+0xfd) [0x7fd9f142beed]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getCommList(std::vector<int, std::allocator<int> > const&)+0x12a) [0x7fd9f142c56a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::allgather(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&)+0x7f3) [0x7fd9f14377c3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x464a98d) [0x7fda3a45598d]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x46502bc) [0x7fda3a45b2bc]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(c10d::ops::allgather(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, c10d::AllgatherOptions const&)+0x104) [0x7fda3a458d64]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(c10d::verify_params_across_processes(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, c10::optional<std::weak_ptr<c10d::Logger> > const&)+0x3bb) [0x7fda3a4ba77b]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x950cbf) [0x7fda40983cbf]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x2279f7) [0x7fda4025a9f7]
/usr/bin/python3.8() [0x5d80be]
/usr/bin/python3.8(_PyObject_MakeTpCall+0x34c) [0x5d8d8c]
/usr/bin/python3.8(_PyEval_EvalFrameDefault+0x4830) [0x561f80]
/usr/bin/python3.8(_PyEval_EvalCodeWithName+0x481) [0x55cd91]
/usr/bin/python3.8(_PyFunction_Vectorcall+0x1e1) [0x5d8941]
/usr/bin/python3.8() [0x4990ca]
/usr/bin/python3.8(_PyEval_EvalCodeWithName+0x768) [0x55d078]
/usr/bin/python3.8(_PyFunction_Vectorcall+0x1e1) [0x5d8941]
/usr/bin/python3.8(_PyObject_FastCallDict+0xc7) [0x5da107]
/usr/bin/python3.8() [0x586de6]
Internal Error: Received signal - Segmentation fault
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fb9df5dcf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7fb9df5dd79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7fba3b844ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7fba3b843b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7fba3b8444b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7fba3bb3ec0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7fb9ea3caa86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7fb9ea3cab14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7fb9ea3cacf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7fb9ea3cad59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7fb9ea3e760a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7fba391f925e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7fba391fa7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7fba3cb8ac87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fb9df5dcf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7fb9df5dd8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fba3cba7f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7fba3cba7e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7fba3cba97f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7fba3b83e957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7fb9df5dd7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7fba3b844ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7fba3b843b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7fba3b8444b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7fba3bb3ec0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7fb9ea3caa86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7fb9ea3cab14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7fb9ea3cacf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7fb9ea3cad59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7fb9ea3e760a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7fba391f925e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7fba391fa7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f45a7d39f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7f45a7d3a79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f4603fa1ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f4603fa0b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f4603fa14b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f460429bc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f45b2b27a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f45b2b27b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f45b2b27cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f45b2b27d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f45b2b4460a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f460195625e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f46019577ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f46052e7c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f45a7d39f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7f45a7d3a8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7f4605304f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7f4605304e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7f46053067f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7f4603f9b957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7f45a7d3a7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f4603fa1ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f4603fa0b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f4603fa14b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f460429bc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f45b2b27a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f45b2b27b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f45b2b27cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f45b2b27d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f45b2b4460a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f460195625e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f46019577ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fa35c143f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7fa35c14479a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7fa3b83abae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7fa3b83aab49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7fa3b83ab4b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7fa3b86a5c0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7fa366f31a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7fa366f31b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7fa366f31cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7fa366f31d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7fa366f4e60a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7fa3b5d6025e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7fa3b5d617ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7fa3b96f1c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7fa35c143f50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7fa35c1448fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fa3b970ef10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7fa3b970ee87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7fa3b97107f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7fa3b83a5957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7fa35c1447b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7fa3b83abae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7fa3b83aab49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7fa3b83ab4b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7fa3b86a5c0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7fa366f31a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7fa366f31b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7fa366f31cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7fa366f31d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7fa366f4e60a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7fa3b5d6025e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7fa3b5d617ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7ff54cc8af50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7ff54cc8b79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7ff5a8ef2ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7ff5a8ef1b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7ff5a8ef24b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7ff5a91ecc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7ff557a78a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7ff557a78b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7ff557a78cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7ff557a78d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7ff557a9560a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7ff5a68a725e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7ff5a68a87ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7ff5aa238c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7ff54cc8af50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7ff54cc8b8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7ff5aa255f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7ff5aa255e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7ff5aa2577f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7ff5a8eec957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7ff54cc8b7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7ff5a8ef2ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7ff5a8ef1b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7ff5a8ef24b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7ff5a91ecc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7ff557a78a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7ff557a78b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7ff557a78cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7ff557a78d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7ff557a9560a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7ff5a68a725e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7ff5a68a87ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f81f5d3cf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7f81f5d3d79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f8251fa4ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f8251fa3b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f8251fa44b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f825229ec0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f8200b2aa86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f8200b2ab14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f8200b2acf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f8200b2ad59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f8200b4760a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f824f95925e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f824f95a7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f82532eac87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f81f5d3cf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7f81f5d3d8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7f8253307f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7f8253307e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7f82533097f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7f8251f9e957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7f81f5d3d7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f8251fa4ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f8251fa3b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f8251fa44b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f825229ec0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f8200b2aa86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f8200b2ab14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f8200b2acf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f8200b2ad59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f8200b4760a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f824f95925e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f824f95a7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f028935df50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7f028935e79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f02e55c5ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f02e55c4b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f02e55c54b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f02e58bfc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f029414ba86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f029414bb14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f029414bcf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f029414bd59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f029416860a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f02e2f7a25e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f02e2f7b7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f02e690bc87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f028935df50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7f028935e8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7f02e6928f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7f02e6928e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7f02e692a7f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7f02e55bf957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7f028935e7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f02e55c5ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f02e55c4b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f02e55c54b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f02e58bfc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f029414ba86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f029414bb14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f029414bcf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f029414bd59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f029416860a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f02e2f7a25e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f02e2f7b7ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f7f917abf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4a) [0x7f7f917ac79a]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f7feda13ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f7feda12b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f7feda134b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f7fedd0dc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f7f9c599a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f7f9c599b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f7f9c599cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f7f9c599d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f7f9c5b660a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f7feb3c825e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f7feb3c97ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7f7feed59c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x50) [0x7f7f917abf50]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x5e) [0x7f7f917ac8fe]
/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7f7feed76f10]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xc7) [0x7f7feed76e87]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x141) [0x7f7feed787f1]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x8c957) [0x7f7feda0d957]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x68) [0x7f7f917ac7b8]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x92ae6) [0x7f7feda13ae6]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0x91b49) [0x7f7feda12b49]
/usr/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a8) [0x7f7feda134b8]
/usr/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x11d) [0x7f7fedd0dc0d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::hostBarrier()+0x3d6) [0x7f7f9c599a86]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x24) [0x7f7f9c599b14]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x17) [0x7f7f9c599cf7]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x9) [0x7f7f9c599d59]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x7860a) [0x7f7f9c5b660a]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22a25e) [0x7f7feb3c825e]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x22b7ae) [0x7f7feb3c97ae]
/usr/bin/python3.8() [0x5ae238]
/usr/bin/python3.8() [0x5b672c]
/usr/bin/python3.8() [0x65dfd9]
/usr/bin/python3.8() [0x5db6f8]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2a) [0x620e0a]
/usr/bin/python3.8(PyImport_Cleanup+0x2fe) [0x63086e]
/usr/bin/python3.8(Py_FinalizeEx+0x6e) [0x642d9e]
/usr/bin/python3.8(Py_RunMain+0xf9) [0x677c99]
/usr/bin/python3.8(Py_BytesMain+0x29) [0x678029]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xe7) [0x7fba3cb8ac87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
_libc_start_main+0xe7) [0x7f46052e7c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
_libc_start_main+0xe7) [0x7fa3b96f1c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
_libc_start_main+0xe7) [0x7ff5aa238c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
_libc_start_main+0xe7) [0x7f82532eac87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
_libc_start_main+0xe7) [0x7f02e690bc87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
_libc_start_main+0xe7) [0x7f7feed59c87]
/usr/bin/python3.8(_start+0x2a) [0x5e1baa]
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node tye-ec2 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
```.
@tengerye, right now what I can think of is that there might be an incompatibility between your AMI instance's software, firmware & driver versions and the Docker software version. Here is the AMI I created:
ubuntu@ip-172-16-9-118:~$ hl-smi
+-----------------------------------------------------------------------------+
| HL-SMI Version: hl-1.8.0-fw-40.0.0.2 |
| Driver Version: 1.8.0-a9c2c49 |
|-------------------------------+----------------------+----------------------+
| AIP Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | AIP-Util Compute M. |
|===============================+======================+======================|
| 0 HL-205 N/A | 0000:20:1d.0 N/A | 0 |
| N/A 38C N/A 99W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 1 HL-205 N/A | 0000:90:1e.0 N/A | 0 |
| N/A 38C N/A 106W / 350W | 512MiB / 32768MiB | 3% N/A |
|-------------------------------+----------------------+----------------------+
| 2 HL-205 N/A | 0000:20:1e.0 N/A | 0 |
| N/A 36C N/A 89W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 3 HL-205 N/A | 0000:90:1d.0 N/A | 0 |
| N/A 36C N/A 99W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 4 HL-205 N/A | 0000:a0:1e.0 N/A | 0 |
| N/A 38C N/A 99W / 350W | 512MiB / 32768MiB | 1% N/A |
|-------------------------------+----------------------+----------------------+
| 5 HL-205 N/A | 0000:a0:1d.0 N/A | 0 |
| N/A 38C N/A 94W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 6 HL-205 N/A | 0000:10:1e.0 N/A | 0 |
| N/A 39C N/A 103W / 350W | 512MiB / 32768MiB | 2% N/A |
|-------------------------------+----------------------+----------------------+
| 7 HL-205 N/A | 0000:10:1d.0 N/A | 0 |
| N/A 37C N/A 99W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| Compute Processes: AIP Memory |
| AIP PID Type Process name Usage |
|=============================================================================|
| 0 N/A N/A N/A N/A |
| 1 N/A N/A N/A N/A |
| 2 N/A N/A N/A N/A |
| 3 N/A N/A N/A N/A |
| 4 N/A N/A N/A N/A |
| 5 N/A N/A N/A N/A |
| 6 N/A N/A N/A N/A |
| 7 N/A N/A N/A N/A |
+=============================================================================+
$dpkg -l | grep habana
dpkg -l | grep habana
ii habanalabs-container-runtime 1.8.0-690 amd64 Habana Labs container runtime. Provides a modified version of runc allowing users to run GPU enabled containers.
ii habanalabs-dkms 1.8.0-690 all habanalabs driver in DKMS format.
ii habanalabs-firmware 1.8.0-690 amd64 Firmware package for Habana Labs processing accelerators
ii habanalabs-firmware-tools 1.8.0-690 amd64 Habanalabs firmware tools package
ii habanalabs-graph 1.8.0-690 amd64 habanalabs graph compiler
ii habanalabs-thunk 1.8.0-690 all habanalabs thunk
please check yours.
Here is how I launch mpirun in my AMI Docker container:
root@ip-172-16-9-118:/Model-References/PyTorch/examples/computer_vision/hello_world# mpirun -n 8 --bind-to core --map-by slot:PE=6 --rank-by core --report-bindings --allow-run-as-root python mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --hmp --hmp-bf16=ops_bf16_mnist.txt --hmp-fp32=ops_fp32_mnist.txt --use_lazy_mode
[ip-172-16-9-118:00642] MCW rank 4 bound to socket 1[core 24[hwt 0-1]], socket 1[core 25[hwt 0-1]], socket 1[core 26[hwt 0-1]], socket 1[core 27[hwt 0-1]], socket 1[core 28[hwt 0-1]], socket 1[core 29[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][BB/BB/BB/BB/BB/BB/../../../../../../../../../../../../../../../../../..]
[ip-172-16-9-118:00642] MCW rank 5 bound to socket 1[core 30[hwt 0-1]], socket 1[core 31[hwt 0-1]], socket 1[core 32[hwt 0-1]], socket 1[core 33[hwt 0-1]], socket 1[core 34[hwt 0-1]], socket 1[core 35[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../BB/BB/BB/BB/BB/BB/../../../../../../../../../../../..]
[ip-172-16-9-118:00642] MCW rank 6 bound to socket 1[core 36[hwt 0-1]], socket 1[core 37[hwt 0-1]], socket 1[core 38[hwt 0-1]], socket 1[core 39[hwt 0-1]], socket 1[core 40[hwt 0-1]], socket 1[core 41[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../BB/BB/BB/BB/BB/BB/../../../../../..]
[ip-172-16-9-118:00642] MCW rank 7 bound to socket 1[core 42[hwt 0-1]], socket 1[core 43[hwt 0-1]], socket 1[core 44[hwt 0-1]], socket 1[core 45[hwt 0-1]], socket 1[core 46[hwt 0-1]], socket 1[core 47[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../BB/BB/BB/BB/BB/BB]
[ip-172-16-9-118:00642] MCW rank 0 bound to socket 0[core 0[hwt 0-1]], socket 0[core 1[hwt 0-1]], socket 0[core 2[hwt 0-1]], socket 0[core 3[hwt 0-1]], socket 0[core 4[hwt 0-1]], socket 0[core 5[hwt 0-1]]: [BB/BB/BB/BB/BB/BB/../../../../../../../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
[ip-172-16-9-118:00642] MCW rank 1 bound to socket 0[core 6[hwt 0-1]], socket 0[core 7[hwt 0-1]], socket 0[core 8[hwt 0-1]], socket 0[core 9[hwt 0-1]], socket 0[core 10[hwt 0-1]], socket 0[core 11[hwt 0-1]]: [../../../../../../BB/BB/BB/BB/BB/BB/../../../../../../../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
[ip-172-16-9-118:00642] MCW rank 2 bound to socket 0[core 12[hwt 0-1]], socket 0[core 13[hwt 0-1]], socket 0[core 14[hwt 0-1]], socket 0[core 15[hwt 0-1]], socket 0[core 16[hwt 0-1]], socket 0[core 17[hwt 0-1]]: [../../../../../../../../../../../../BB/BB/BB/BB/BB/BB/../../../../../..][../../../../../../../../../../../../../../../../../../../../../../../..]
[ip-172-16-9-118:00642] MCW rank 3 bound to socket 0[core 18[hwt 0-1]], socket 0[core 19[hwt 0-1]], socket 0[core 20[hwt 0-1]], socket 0[core 21[hwt 0-1]], socket 0[core 22[hwt 0-1]], socket 0[core 23[hwt 0-1]]: [../../../../../../../../../../../../../../../../../../BB/BB/BB/BB/BB/BB][../../../../../../../../../../../../../../../../../../../../../../../..]
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
| distributed init (rank 0): env://
| distributed init (rank 1): env://
| distributed init (rank 2): env://
| distributed init (rank 3): env://
| distributed init (rank 4): env://
| distributed init (rank 5): env://
| distributed init (rank 6): env://
| distributed init (rank 7): env://
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784300912 KB
============================================================================================
Train Epoch: 1 [0/7500.0 (0%)] Loss: 2.306436
Train Epoch: 1 [640/7500.0 (9%)] Loss: 1.257539
Train Epoch: 1 [1280/7500.0 (17%)] Loss: 0.541899
Train Epoch: 1 [1920/7500.0 (26%)] Loss: 0.265077
Train Epoch: 1 [2560/7500.0 (34%)] Loss: 0.221311
Train Epoch: 1 [3200/7500.0 (43%)] Loss: 0.159253
Train Epoch: 1 [3840/7500.0 (51%)] Loss: 0.149002
Train Epoch: 1 [4480/7500.0 (60%)] Loss: 0.158327
Train Epoch: 1 [5120/7500.0 (68%)] Loss: 0.117364
Train Epoch: 1 [5760/7500.0 (77%)] Loss: 0.095949
Train Epoch: 1 [6400/7500.0 (85%)] Loss: 0.057692
Train Epoch: 1 [7040/7500.0 (94%)] Loss: 0.069655
Total test set: 10000, number of workers: 8
* Average Acc 97.574 Average loss 0.071
Hi @ytang-habana, I changed my OS to Ubuntu 20.04, but it still failed. The following details what I did.
sudo modprobe -r habanalabs habanalabs_en
and sudo modprobe habanalabs habanalabs_en
Then I ran hl-smi
and got the following:
+-----------------------------------------------------------------------------+
| HL-SMI Version: hl-1.8.0-fw-40.0.0.2 |
| Driver Version: 1.8.0-a9c2c49 |
|-------------------------------+----------------------+----------------------+
| AIP Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | AIP-Util Compute M. |
|===============================+======================+======================|
| 0 HL-205 N/A | 0000:10:1d.0 N/A | 0 |
| N/A 26C N/A 100W / 350W | 512MiB / 32768MiB | 1% N/A |
|-------------------------------+----------------------+----------------------+
| 1 HL-205 N/A | 0000:90:1d.0 N/A | 0 |
| N/A 26C N/A 103W / 350W | 512MiB / 32768MiB | 2% N/A |
|-------------------------------+----------------------+----------------------+
| 2 HL-205 N/A | 0000:10:1e.0 N/A | 0 |
| N/A 25C N/A 93W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 3 HL-205 N/A | 0000:20:1d.0 N/A | 0 |
| N/A 26C N/A 98W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 4 HL-205 N/A | 0000:90:1e.0 N/A | 0 |
| N/A 25C N/A 93W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 5 HL-205 N/A | 0000:20:1e.0 N/A | 0 |
| N/A 25C N/A 97W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 6 HL-205 N/A | 0000:a0:1d.0 N/A | 0 |
| N/A 26C N/A 92W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| 7 HL-205 N/A | 0000:a0:1e.0 N/A | 0 |
| N/A 21C N/A 90W / 350W | 512MiB / 32768MiB | 0% N/A |
|-------------------------------+----------------------+----------------------+
| Compute Processes: AIP Memory |
| AIP PID Type Process name Usage |
|=============================================================================|
| 0 N/A N/A N/A N/A |
| 1 N/A N/A N/A N/A |
| 2 N/A N/A N/A N/A |
| 3 N/A N/A N/A N/A |
| 4 N/A N/A N/A N/A |
| 5 N/A N/A N/A N/A |
| 6 N/A N/A N/A N/A |
| 7 N/A N/A N/A N/A |
+=============================================================================+
tye@tye-ec2:~$ hl-smi -n link -i 0000:10:1d.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:90:1d.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:10:1e.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:20:1d.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:90:1e.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:20:1e.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:a0:1d.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
tye@tye-ec2:~$ hl-smi -n link -i 0000:a0:1e.0
port 0: UP
port 2: UP
port 3: UP
port 4: UP
port 5: UP
port 6: UP
port 7: UP
docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host --privileged vault.habana.ai/gaudi-docker/1.8.0/ubuntu20.04/habanalabs/pytorch-installer-1.13.1:latest
Then I ran git clone https://github.com/HabanaAI/Model-References.git
, cd Model-References/PyTorch/examples/computer_vision/hello_world/
, export PYTHONPATH=$PYTHONPATH:Model-References
, and export PYTHON=/usr/bin/python3.8
Then I ran $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 \
--gamma=0.7 --hpu --hmp \
--hmp-bf16=ops_bf16_mnist.txt \
--hmp-fp32=ops_fp32_mnist.txt \
--use_lazy_mode
and everything went well.
mpirun -n 8 --bind-to core --map-by slot:PE=6 \
--rank-by core --report-bindings \
--allow-run-as-root \
$PYTHON mnist.py \
--batch-size=64 --epochs=1 \
--lr=1.0 --gamma=0.7 \
--hpu --hmp --hmp-bf16=ops_bf16_mnist.txt \
--hmp-fp32=ops_fp32_mnist.txt \
--use_lazy_mode
and the following is the output:
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
hmp:verbose_mode False
hmp:opt_level O1
| distributed init (rank 1): env://
| distributed init (rank 2): env://
| distributed init (rank 3): env://
| distributed init (rank 4): env://
| distributed init (rank 5): env://
| distributed init (rank 6): env://
| distributed init (rank 7): env://
| distributed init (rank 0): env://
=============================HABANA PT BRIDGE CONFIGURATION ===========================
PT_HPU_LAZY_MODE = 1
PT_HPU_LAZY_EAGER_OPTIM_CACHE = 1
PT_HPU_ENABLE_COMPILE_THREAD = 0
PT_HPU_ENABLE_EXECUTION_THREAD = 1
PT_HPU_ENABLE_LAZY_EAGER_EXECUTION_THREAD = 1
PT_ENABLE_INTER_HOST_CACHING = 0
PT_ENABLE_INFERENCE_MODE = 1
PT_ENABLE_HABANA_CACHING = 1
PT_HPU_MAX_RECIPE_SUBMISSION_LIMIT = 0
PT_HPU_MAX_COMPOUND_OP_SIZE = 9223372036854775807
PT_HPU_MAX_COMPOUND_OP_SIZE_SS = 10
PT_HPU_ENABLE_STAGE_SUBMISSION = 1
PT_HPU_STAGE_SUBMISSION_MODE = 2
PT_HPU_PGM_ENABLE_CACHE = 1
PT_HPU_ENABLE_LAZY_COLLECTIVES = 0
PT_HCCL_SLICE_SIZE_MB = 16
PT_HCCL_MEMORY_ALLOWANCE_MB = 0
PT_HPU_INITIAL_WORKSPACE_SIZE = 0
PT_HABANA_POOL_SIZE = 24
PT_HPU_POOL_STRATEGY = 5
PT_HPU_POOL_LOG_FRAGMENTATION_INFO = 0
PT_ENABLE_MEMORY_DEFRAGMENTATION = 1
PT_ENABLE_DEFRAGMENTATION_INFO = 0
PT_HPU_ENABLE_SYNAPSE_LAYOUT_HANDLING = 1
PT_HPU_ENABLE_SYNAPSE_OUTPUT_PERMUTE = 1
PT_HPU_ENABLE_VALID_DATA_RANGE_CHECK = 1
PT_HPU_FORCE_USE_DEFAULT_STREAM = 0
PT_RECIPE_CACHE_PATH =
PT_HPU_ENABLE_REFINE_DYNAMIC_SHAPES = 0
PT_HPU_DYNAMIC_MIN_POLICY_ORDER = 4,5,3,1
PT_HPU_DYNAMIC_MAX_POLICY_ORDER = 2,4,5,3,1
PT_HPU_LAZY_ACC_PAR_MODE = 1
=============================SYSTEM CONFIGURATION =========================================
Num CPU Cores = 96
CPU RAM = 784300912 KB
============================================================================================
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f068554e204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7f068554eef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7f06d6c17090]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_ifs(std::vector<hccl::detected_tcp_if, std::allocator<hccl::detected_tcp_if> >&)+0x469) [0x7f0684398ac9]
/usr/lib/habanalabs/libhcl.so(hccl::detect_tcp_if()+0x3d) [0x7f06843990ed]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_coordinator::create(bool)+0x8e) [0x7f068435898e]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::add_new_coordinator(bool)+0x3a) [0x7f068434591a]
/usr/lib/habanalabs/libhcl.so(hccl::hccl_context::get_unique_id(hcclUniqueId*)+0x26c) [0x7f068434665c]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_Original(hcclUniqueId*)+0x37) [0x7f06842f2a77]
/usr/lib/habanalabs/libhcl.so(hcclGetUniqueId_impl(hcclUniqueId*)+0x3d) [0x7f06842f422d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/lib/libpytorch_synapse_helpers.so(hcclGetUniqueId+0x37) [0x7f0689110e87]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getComm(int)+0x124) [0x7f0688d70804]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::getCommList(std::vector<int, std::allocator<int> > const&)+0x112) [0x7f0688d70e32]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::allgather(std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > >&, std::vector<at::Tensor, std::allocator<at::Tensor> >&, c10d::AllgatherOptions const&)+0x74a) [0x7f0688d7c1ba]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x4f8ff19) [0x7f06cebf4f19]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(+0x4f95a12) [0x7f06cebfaa12]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(c10d::ops::allgather(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, std::vector<std::vector<at::Tensor, std::allocator<at::Tensor> >, std::allocator<std::vector<at::Tensor, std::allocator<at::Tensor> > > > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, c10d::AllgatherOptions const&)+0x157) [0x7f06cebf8247]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_cpu.so(c10d::verify_params_across_processes(c10::intrusive_ptr<c10d::ProcessGroup, c10::detail::intrusive_target_default_null_type<c10d::ProcessGroup> > const&, std::vector<at::Tensor, std::allocator<at::Tensor> > const&, c10::optional<std::weak_ptr<c10d::Logger> > const&)+0x331) [0x7f06cec5b761]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0xaff33d) [0x7f06d5bc433d]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a29e5) [0x7f06d54679e5]
/usr/bin/python3.8(PyCFunction_Call+0x59) [0x5f5b39]
/usr/bin/python3.8(_PyObject_MakeTpCall+0x296) [0x5f6706]
/usr/bin/python3.8(_PyEval_EvalFrameDefault+0x5db3) [0x571143]
/usr/bin/python3.8(_PyEval_EvalCodeWithName+0x26a) [0x569d8a]
/usr/bin/python3.8(_PyFunction_Vectorcall+0x393) [0x5f60c3]
/usr/bin/python3.8(_PyEval_EvalFrameDefault+0x726) [0x56bab6]
/usr/bin/python3.8(_PyEval_EvalCodeWithName+0x26a) [0x569d8a]
/usr/bin/python3.8(_PyFunction_Vectorcall+0x393) [0x5f60c3]
/usr/bin/python3.8() [0x59c130]
/usr/bin/python3.8(_PyObject_MakeTpCall+0x1ff) [0x5f666f]
Internal Error: Received signal - Segmentation fault
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
Traceback (most recent call last):
File "mnist.py", line 226, in <module>
main()
File "mnist.py", line 204, in main
model = torch.nn.parallel.DistributedDataParallel(model, bucket_cap_mb=100, broadcast_buffers=False,
File "/usr/local/lib/python3.8/dist-packages/torch/nn/parallel/distributed.py", line 655, in __init__
_verify_param_shape_across_processes(self.process_group, parameters)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/utils.py", line 112, in _verify_param_shape_across_processes
return dist._verify_params_across_processes(process_group, tensors, logger)
RuntimeError: Connection reset by peer
terminate called after throwing an instance of 'std::system_error'
what(): Broken pipe
Internal Error: Received signal - Aborted
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f102fe42204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7f102fe42d8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f10811aa38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f10811a9369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f10811a9d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f10812e5f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f103365a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f103366714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f10336673bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f103366741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f103367db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f107fd591e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f107fd5a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f10814ec083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f102fe42204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7f102fe42ef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7f108150b090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7f108150b00b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7f10814ea859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7f108119e911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7f102fe42dac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f10811aa38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f10811a9369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f10811a9d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f10812e5f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f103365a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f103366714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f10336673bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f103366741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f103367db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f107fd591e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f107fd5a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f6bbaad8204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7f6bbaad8d8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f6c0be4038c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f6c0be3f369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f6c0be3fd21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f6c0bf7bf89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f6bbe2f0833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f6bbe2fd14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f6bbe2fd3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f6bbe2fd41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f6bbe313b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f6c0a9ef1e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f6c0a9f0103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f6c0c182083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f6bbaad8204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7f6bbaad8ef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7f6c0c1a1090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7f6c0c1a100b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7f6c0c180859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7f6c0be34911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7f6bbaad8dac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f6c0be4038c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f6c0be3f369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f6c0be3fd21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f6c0bf7bf89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f6bbe2f0833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f6bbe2fd14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f6bbe2fd3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f6bbe2fd41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f6bbe313b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f6c0a9ef1e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f6c0a9f0103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f26926ff204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7f26926ffd8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f26e3a6738c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f26e3a66369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f26e3a66d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f26e3ba2f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f2695f17833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f2695f2414a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f2695f243bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f2695f2441d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f2695f3ab31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f26e26161e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f26e2617103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f26e3da9083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f26926ff204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7f26926ffef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7f26e3dc8090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7f26e3dc800b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7f26e3da7859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7f26e3a5b911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7f26926ffdac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f26e3a6738c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f26e3a66369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f26e3a66d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f26e3ba2f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f2695f17833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f2695f2414a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f2695f243bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f2695f2441d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f2695f3ab31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f26e26161e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f26e2617103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7fd815ef7204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7fd815ef7d8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fd86725f38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fd86725e369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fd86725ed21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fd86739af89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7fd81970f833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7fd81971c14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7fd81971c3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7fd81971c41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7fd819732b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fd865e0e1e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fd865e0f103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fd8675a1083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7fd815ef7204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7fd815ef7ef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7fd8675c0090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7fd8675c000b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7fd86759f859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7fd867253911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7fd815ef7dac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fd86725f38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fd86725e369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fd86725ed21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fd86739af89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7fd81970f833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7fd81971c14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7fd81971c3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7fd81971c41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7fd819732b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fd865e0e1e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fd865e0f103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7febbfd22204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7febbfd22d8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fec1108a38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fec11089369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fec11089d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fec111c5f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7febc353a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7febc354714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7febc35473bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7febc354741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7febc355db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fec0fc391e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fec0fc3a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fec113cc083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7febbfd22204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7febbfd22ef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7fec113eb090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7fec113eb00b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7fec113ca859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7fec1107e911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7febbfd22dac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fec1108a38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fec11089369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fec11089d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fec111c5f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7febc353a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7febc354714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7febc35473bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7febc354741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7febc355db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fec0fc391e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fec0fc3a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7fce9643a204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7fce9643ad8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fcee77a238c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fcee77a1369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fcee77a1d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fcee78ddf89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7fce99c52833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7fce99c5f14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7fce99c5f3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7fce99c5f41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7fce99c75b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fcee63511e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fcee6352103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fcee7ae4083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7fce9643a204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7fce9643aef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7fcee7b03090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7fcee7b0300b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7fcee7ae2859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7fcee7796911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7fce9643adac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7fcee77a238c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7fcee77a1369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7fcee77a1d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7fcee78ddf89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7fce99c52833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7fce99c5f14a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7fce99c5f3bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7fce99c5f41d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7fce99c75b31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7fcee63511e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7fcee6352103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f3a6e962204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x4e) [0x7f3a6e962d8e]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f3abfcca38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f3abfcc9369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f3abfcc9d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f3abfe05f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f3a7217a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f3a7218714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f3a721873bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f3a7218741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f3a7219db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f3abe8791e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f3abe87a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7f3ac000c083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::logStackTrace()+0x54) [0x7f3a6e962204]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::signalHandler(int, siginfo_t*, void*)+0x62) [0x7f3a6e962ef2]
/lib/x86_64-linux-gnu/libc.so.6(+0x43090) [0x7f3ac002b090]
/lib/x86_64-linux-gnu/libc.so.6(gsignal+0xcb) [0x7f3ac002b00b]
/lib/x86_64-linux-gnu/libc.so.6(abort+0x12b) [0x7f3ac000a859]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0x9e911) [0x7f3abfcbe911]
/usr/lib/habanalabs/libSynapse.so(synapse::LogManager::terminateHandler()+0x6c) [0x7f3a6e962dac]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xaa38c) [0x7f3abfcca38c]
/lib/x86_64-linux-gnu/libstdc++.so.6(+0xa9369) [0x7f3abfcc9369]
/lib/x86_64-linux-gnu/libstdc++.so.6(__gxx_personality_v0+0x2a1) [0x7f3abfcc9d21]
/lib/x86_64-linux-gnu/libunwind.so.8(__libunwind_Unwind_Resume+0x129) [0x7f3abfe05f89]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x33833) [0x7f3a7217a833]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::destroy()+0x2a) [0x7f3a7218714a]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0x1b) [0x7f3a721873bb]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(c10d::ProcessGroupHCCL::~ProcessGroupHCCL()+0xd) [0x7f3a7218741d]
/usr/local/lib/python3.8/dist-packages/habana_frameworks/torch/distributed/_hccl_C.so(+0x56b31) [0x7f3a7219db31]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a01e3) [0x7f3abe8791e3]
/usr/local/lib/python3.8/dist-packages/torch/lib/libtorch_python.so(+0x3a1103) [0x7f3abe87a103]
/usr/bin/python3.8() [0x5ce8a8]
/usr/bin/python3.8() [0x5d176c]
/usr/bin/python3.8(PyDict_Clear+0xeb) [0x5ce9eb]
/usr/bin/python3.8() [0x6aa83a]
/usr/bin/python3.8() [0x4effff]
/usr/bin/python3.8(_PyGC_CollectNoFail+0x2f) [0x66ff0f]
/usr/bin/python3.8(PyImport_Cleanup+0x314) [0x685dd4]
/usr/bin/python3.8(Py_FinalizeEx+0x7f) [0x680b4f]
/usr/bin/python3.8(Py_RunMain+0x32d) [0x6b819d]
/usr/bin/python3.8(Py_BytesMain+0x2d) [0x6b840d]
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3) [0x7fec113cc083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7fcee7ae4083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7f10814ec083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7f3ac000c083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7f6c0c182083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7f26e3da9083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
start_main+0xf3) [0x7fd8675a1083]
/usr/bin/python3.8(_start+0x2e) [0x5faa2e]
--------------------------------------------------------------------------
mpirun noticed that process rank 0 with PID 0 on node tye-ec2 exited on signal 11 (Segmentation fault).
--------------------------------------------------------------------------
@tengerye, based on the last message you provided, the root cause does not lie in Lightning-AI, because you can reproduce the crash with plain PyTorch under mpirun as well. I suggest we close this as a Lightning-AI issue. However, I cannot reproduce any of these failures on my side, so we need to investigate the hardware and software configuration of your AMI. By the way, could you please terminate this AMI, create a new one, and try again?
Hi @ytang-habana, this is already a new AMI running Ubuntu 20.04. The original one was Ubuntu 18.04.
Bug description
The code ran fine on a single HPU; however, it failed when I used more than one HPU.
How to reproduce the bug
Error messages and logs
Environment
I am using HPUs on an AWS instance.
``` #- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow): Trainer, LightningModule #- PyTorch Lightning Version (e.g., 1.5.0): 1.8.6 #- Lightning App Version (e.g., 0.5.2): #- PyTorch Version (e.g., 2.0): 1.13.1a0+git0b11ee5 #- Python version (e.g., 3.9): 3.8.16 #- OS (e.g., Linux): Linux, 68~18.04.1-Ubuntu SMP #- CUDA/cuDNN version: N/A #- GPU models and configuration: N/A #- How you installed Lightning(`conda`, `pip`, source): pip #- Running environment of LightningApp (e.g. local, cloud): N/A ```More info
cc @jerome-habana