dusty-nv / jetson-containers

Machine Learning Containers for NVIDIA Jetson and JetPack-L4T
MIT License
2.13k stars 440 forks source link

Nemo docker: AttributeError: module 'torch.distributed' has no attribute 'is_initialized' #315

Open zhanghui-china opened 10 months ago

zhanghui-china commented 10 months ago

docker image name: dustynv/nemo:r35.3.1

device: Jetson AGX Orin 32G

Steps to reproduce:

```python
import nemo
import nemo.collections.asr as nemo_asr
import nemo.collections.tts as nemo_tts
import chinese2digits as c2d
```

[image]

```python
citrinet = nemo_asr.models.EncDecCTCModel.restore_from("stt_zh_citrinet_512.nemo")
```

[image] [image]

```python
asr_result = citrinet.transcribe(paths2audio_files=["test.wav"])
asr_result = " ".join(asr_result)
asr_result = c2d.takeChineseNumberFromString(asr_result)['replacedText']
print(asr_result)
```

[image] [image]

```
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[29], line 1
----> 1 asr_result = citrinet.transcribe(paths2audio_files=["test.wav"])
      2 asr_result = " ".join(asr_result)
      3 asr_result = c2d.takeChineseNumberFromString(asr_result)['replacedText']

File /usr/local/lib/python3.8/dist-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File /usr/local/lib/python3.8/dist-packages/nemo/collections/asr/models/ctc_models.py:198, in EncDecCTCModel.transcribe(self, paths2audio_files, batch_size, logprobs, return_hypotheses, num_workers, channel_selector, augmentor, verbose)
    196 temporary_datalayer = self._setup_transcribe_dataloader(config)
    197 for test_batch in tqdm(temporary_datalayer, desc="Transcribing", disable=not verbose):
--> 198     logits, logits_len, greedy_predictions = self.forward(
    199         input_signal=test_batch[0].to(device), input_signal_length=test_batch[1].to(device)
    200     )
    202     if logprobs:
    203         # dump log probs per file
    204         for idx in range(logits.shape[0]):

File /usr/local/lib/python3.8/dist-packages/nemo/core/classes/common.py:1087, in typecheck.__call__(self, wrapped, instance, args, kwargs)
   1084 instance._validate_input_types(input_types=input_types, ignore_collections=self.ignore_collections, **kwargs)
   1086 # Call the method - this can be forward, or any other callable method
-> 1087 outputs = wrapped(*args, **kwargs)
   1089 instance._attach_and_validate_output_types(
   1090     output_types=output_types, ignore_collections=self.ignore_collections, out_objects=outputs
   1091 )
   1093 return outputs

File /usr/local/lib/python3.8/dist-packages/nemo/collections/asr/models/ctc_models.py:543, in EncDecCTCModel.forward(self, input_signal, input_signal_length, processed_signal, processed_signal_length)
    540 if self.spec_augmentation is not None and self.training:
    541     processed_signal = self.spec_augmentation(input_spec=processed_signal, length=processed_signal_length)
--> 543 encoder_output = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
    544 encoded = encoder_output[0]
    545 encoded_len = encoder_output[1]

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/nemo/core/classes/common.py:1087, in typecheck.__call__(self, wrapped, instance, args, kwargs)
   1084 instance._validate_input_types(input_types=input_types, ignore_collections=self.ignore_collections, **kwargs)
   1086 # Call the method - this can be forward, or any other callable method
-> 1087 outputs = wrapped(*args, **kwargs)
   1089 instance._attach_and_validate_output_types(
   1090     output_types=output_types, ignore_collections=self.ignore_collections, out_objects=outputs
   1091 )
   1093 return outputs

File /usr/local/lib/python3.8/dist-packages/nemo/collections/asr/modules/conv_asr.py:196, in ConvASREncoder.forward(self, audio_signal, length)
    194 @typecheck()
    195 def forward(self, audio_signal, length):
--> 196     self.update_max_sequence_length(seq_length=audio_signal.size(2), device=audio_signal.device)
    197     s_input, length = self.encoder(([audio_signal], length))
    198     if length is None:

File /usr/local/lib/python3.8/dist-packages/nemo/collections/asr/modules/conv_asr.py:205, in ConvASREncoder.update_max_sequence_length(self, seq_length, device)
    203 def update_max_sequence_length(self, seq_length: int, device):
    204     # Find global max audio length across all nodes
--> 205     if torch.distributed.is_initialized():
    206         global_max_len = torch.tensor([seq_length], dtype=torch.float32, device=device)
    208         # Update across all ranks in the distributed system

AttributeError: module 'torch.distributed' has no attribute 'is_initialized'
```

dusty-nv commented 10 months ago

> AttributeError: module 'torch.distributed' has no attribute 'is_initialized'

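Hi @zhanghui-china, the PyTorch wheels for Jetson aren't built with USE_DISTRIBUTED enabled, so the torch.distributed module in this container has no is_initialized() function. The NeMo Dockerfile probably needs another sed command to remove that call to torch.distributed.is_initialized(), like the one here: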

https://github.com/dusty-nv/jetson-containers/blob/3f4db21d6d938070827b7e258aa855e6b1a390c7/packages/nemo/Dockerfile#L71