Open terryops opened 9 months ago
[NeMo I 2023-10-17 17:09:36 speaker_utils:93] Number of files to diarize: 1 [NeMo I 2023-10-17 17:09:36 clustering_diarizer:309] Split long audio file to avoid CUDA memory issue splitting manifest: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.26s/it] [NeMo I 2023-10-17 17:09:39 vad_utils:107] The prepared manifest file exists. Overwriting! [NeMo I 2023-10-17 17:09:39 classification_models:272] Perform streaming frame-level VAD [NeMo I 2023-10-17 17:09:39 collections:301] Filtered duration for loading collection is 0.00 hours. [NeMo I 2023-10-17 17:09:39 collections:302] Dataset loaded with 71 items, total duration of 0.98 hours. [NeMo I 2023-10-17 17:09:39 collections:304] # 71 files loaded accounting to # 1 labels vad: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 71/71 [00:10<00:00, 6.99it/s] [NeMo I 2023-10-17 17:09:49 clustering_diarizer:250] Generating predictions with overlapping input segments [NeMo I 2023-10-17 17:10:12 clustering_diarizer:262] Converting frame level prediction to speech/no-speech segment in start and end times format. 
creating speech segments: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00, 2.67s/it] ╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮ │ /mnt/bigdisk/whisper-diarization/nemo_process.py:29 in <module> │ │ │ │ 26 │ │ 27 # Initialize NeMo MSDD diarization model │ │ 28 msdd_model = NeuralDiarizer(cfg=create_config(temp_path)).to(args.device) │ │ ❱ 29 msdd_model.diarize() │ │ 30 │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/torch/utils/_contextlib.py:115 in │ │ decorate_context │ │ │ │ 112 │ @functools.wraps(func) │ │ 113 │ def decorate_context(*args, **kwargs): │ │ 114 │ │ with ctx_factory(): │ │ ❱ 115 │ │ │ return func(*args, **kwargs) │ │ 116 │ │ │ 117 │ return decorate_context │ │ 118 │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │ │ 1180 in diarize │ │ │ │ 1177 │ │ Note that the result of MSDD can include multiple speakers at the same time. The │ │ 1178 │ │ function that can generate overlapping timestamps. `self.run_overlap_aware_eval( │ │ 1179 │ │ """ │ │ ❱ 1180 │ │ self.clustering_embedding.prepare_cluster_embs_infer() │ │ 1181 │ │ self.msdd_model.pairwise_infer = True │ │ 1182 │ │ self.get_emb_clus_infer(self.clustering_embedding) │ │ 1183 │ │ preds_list, targets_list, signal_lengths_list = self.run_pairwise_diarization() │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │ │ 699 in prepare_cluster_embs_infer │ │ │ │ 696 │ │ Launch clustering diarizer to prepare embedding vectors and clustering results. │ │ 697 │ │ """ │ │ 698 │ │ self.max_num_speakers = self.cfg_diar_infer.diarizer.clustering.parameters.max_n │ │ ❱ 699 │ │ self.emb_sess_test_dict, self.emb_seq_test, self.clus_test_label_dict, _ = self. 
│ │ 700 │ │ │ self._cfg_msdd.test_ds.manifest_filepath, self._cfg_msdd.test_ds.emb_dir │ │ 701 │ │ ) │ │ 702 │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/msdd_models.py: │ │ 866 in run_clustering_diarizer │ │ │ │ 863 │ │ │ │ 864 │ │ logging.info(f"Multiscale Weights: {self.clus_diar_model.multiscale_args_dict['m │ │ 865 │ │ logging.info(f"Clustering Parameters: {clustering_params_str}") │ │ ❱ 866 │ │ scores = self.clus_diar_model.diarize(batch_size=self.cfg_diar_infer.batch_size) │ │ 867 │ │ │ │ 868 │ │ # If RTTM (ground-truth diarization annotation) files do not exist, scores is No │ │ 869 │ │ if scores is not None: │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │ │ izer.py:437 in diarize │ │ │ │ 434 │ │ os.makedirs(out_rttm_dir, exist_ok=True) │ │ 435 │ │ │ │ 436 │ │ # Speech Activity Detection │ │ ❱ 437 │ │ self._perform_speech_activity_detection() │ │ 438 │ │ │ │ 439 │ │ # Segmentation │ │ 440 │ │ scales = self.multiscale_args_dict['scale_dict'].items() │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │ │ izer.py:325 in _perform_speech_activity_detection │ │ │ │ 322 │ │ │ │ ) │ │ 323 │ │ │ │ │ 324 │ │ │ self._setup_vad_test_data(manifest_vad_input) │ │ ❱ 325 │ │ │ self._run_vad(manifest_vad_input) │ │ 326 │ │ │ │ 327 │ │ elif self._diarizer_params.vad.external_vad_manifest is not None: │ │ 328 │ │ │ self._speaker_manifest_path = self._diarizer_params.vad.external_vad_manifes │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/models/clustering_diar │ │ izer.py:281 in _run_vad │ │ │ │ 278 │ │ │ else: │ │ 279 │ │ │ │ logging.warning(f"no vad file found for {key} due to zero or negative du │ │ 280 │ │ │ │ ❱ 281 │ │ write_rttm2manifest(AUDIO_VAD_RTTM_MAP, self._vad_out_file) │ │ 282 │ │ self._speaker_manifest_path = self._vad_out_file │ │ 283 │ │ │ 284 │ def 
_run_segmentation(self, window: float, shift: float, scale_tag: str = ''): │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/speaker_ut │ │ ils.py:858 in write_rttm2manifest │ │ │ │ 855 │ │ for uniq_id in AUDIO_RTTM_MAP: │ │ 856 │ │ │ rttm_file_path = AUDIO_RTTM_MAP[uniq_id]['rttm_filepath'] │ │ 857 │ │ │ rttm_lines = read_rttm_lines(rttm_file_path) │ │ ❱ 858 │ │ │ offset, duration = get_offset_and_duration(AUDIO_RTTM_MAP, uniq_id, decimals │ │ 859 │ │ │ vad_start_end_list_raw = [] │ │ 860 │ │ │ for line in rttm_lines: │ │ 861 │ │ │ │ start, dur = get_vad_out_from_rttm_line(line) │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/nemo/collections/asr/parts/utils/speaker_ut │ │ ils.py:565 in get_offset_and_duration │ │ │ │ 562 │ │ offset = round(AUDIO_RTTM_MAP[uniq_id]['offset'], decimals) │ │ 563 │ else: │ │ 564 │ │ sound = sf.SoundFile(audio_path) │ │ ❱ 565 │ │ duration = sound.frames / sound.samplerate │ │ 566 │ │ offset = 0.0 │ │ 567 │ return offset, duration │ │ 568 │ │ │ │ /mnt/bigdisk/miniconda3/lib/python3.10/site-packages/soundfile.py:822 in __getattr__ │ │ │ │ 819 │ │ │ data = _snd.sf_get_string(self._file, _str_types[name]) │ │ 820 │ │ │ return _ffi.string(data).decode('utf-8', 'replace') if data else "" │ │ 821 │ │ else: │ │ ❱ 822 │ │ │ raise AttributeError( │ │ 823 │ │ │ │ "'SoundFile' object has no attribute {0!r}".format(name)) │ │ 824 │ │ │ 825 │ def __len__(self): │ ╰──────────────────────────────────────────────────────────────────────────────────────────────────╯ AttributeError: 'SoundFile' object has no attribute 'frames'```
Please upload the audio file using any method so we can reproduce the issue.
https://drive.google.com/file/d/1fyxn2N2sfnP3ZEhU8_xO8NS9E9roG2Rz/view?usp=share_link
I've uploaded the file already, would you please take a look?