espnet / espnet

End-to-End Speech Processing Toolkit
https://espnet.github.io/espnet/
Apache License 2.0
8.43k stars 2.18k forks source link

espnet2 ctc segmentation #3232

Closed abnerLing closed 2 years ago

abnerLing commented 3 years ago

I've tried testing out the CTC segmentation example with ESPnet2 and I get the same error of "No such file or directory: 'data/nlsyms.txt'." This occurs when using the below script and also when running the exact same python code in the example. espnet2/bin/asr_align.py --asr_train_config config.yaml --asr_model_file model.pth --audio audio.wav --text text.txt --output segments

Traceback (most recent call last): File "espnet2/bin/asr_align.py", line 827, in <module> main() File "espnet2/bin/asr_align.py", line 823, in main ctc_align(**kwargs) File "espnet2/bin/asr_align.py", line 632, in ctc_align aligner = CTCSegmentation(**model, **kwargs) File "espnet2/bin/asr_align.py", line 239, in __init__ self.preprocess_fn = ASRTask.build_preprocess_fn(asr_train_args, False) File "/home/abner/work/espnet/espnet2/tasks/asr.py", line 293, in build_preprocess_fn retval = CommonPreprocessor( File "/home/abner/work/espnet/espnet2/train/preprocessor.py", line 161, in __init__ self.tokenizer = build_tokenizer( File "/home/abner/work/espnet/espnet2/text/build_tokenizer.py", line 46, in build_tokenizer return CharTokenizer( File "/home/abner/work/espnet/espnet2/text/char_tokenizer.py", line 24, in __init__ with non_linguistic_symbols.open("r", encoding="utf-8") as f: File "/home/abner/anaconda3/envs/es/lib/python3.8/pathlib.py", line 1222, in open return io.open(self, mode, buffering, encoding, errors, newline, File "/home/abner/anaconda3/envs/es/lib/python3.8/pathlib.py", line 1078, in _opener return self._accessor.open(self, flags, mode) FileNotFoundError: [Errno 2] No such file or directory: 'data/nlsyms.txt'

Basic environments:

lumaku commented 3 years ago

I can confirm this issue. It happens on the current master (but did not occur in #3087 ?)

It seems that the text cleaner tries to load non_linguistic_symbols from a file that doesn't exist. In config.yaml:

non_linguistic_symbols: data/nlsyms.txt

Then the code crashes here: https://github.com/espnet/espnet/blob/fe551ff9370dbb95507d74701c139743b180fa59/espnet2/text/char_tokenizer.py#L20-L27

To reproduce, with a freshly set-up espnet:


from espnet_model_zoo.downloader import ModelDownloader
d = ModelDownloader(cachedir="./modelcache")
wsjmodel = d.download_and_unpack("kamo-naoyuki/wsj")
# load the example file included in the ESPnet repository
import soundfile
speech, rate = soundfile.read("./test_utils/ctc_align_test.wav")
# CTC segmentation
from espnet2.bin.asr_align import CTCSegmentation
aligner = CTCSegmentation( **wsjmodel , fs=rate )

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-1-9437fb12186a> in <module>
      7 # CTC segmentation
      8 from espnet2.bin.asr_align import CTCSegmentation
----> 9 aligner = CTCSegmentation( **wsjmodel , fs=rate )

/xxx/espnet/espnet2/bin/asr_align.py in __init__(self, asr_train_config, asr_model_file, fs, ngpu, batch_size, dtype, kaldi_style_text, text_converter, time_stamps, **ctc_segmentation_args)
    237         )
    238         asr_model.to(dtype=getattr(torch, dtype)).eval()
--> 239         self.preprocess_fn = ASRTask.build_preprocess_fn(asr_train_args, False)
    240 
    241         # Warn for nets with high memory consumption on long audio files

/xxx/espnet/espnet2/tasks/asr.py in build_preprocess_fn(cls, args, train)
    291         assert check_argument_types()
    292         if args.use_preprocessor:
--> 293             retval = CommonPreprocessor(
    294                 train=train,
    295                 token_type=args.token_type,

/xxx/espnet/espnet2/train/preprocessor.py in __init__(self, train, token_type, token_list, bpemodel, text_cleaner, g2p_type, unk_symbol, space_symbol, non_linguistic_symbols, delimiter, rir_scp, rir_apply_prob, noise_scp, noise_apply_prob, noise_db_range, speech_volume_normalize, speech_name, text_name)
    159             self.text_cleaner = TextCleaner(text_cleaner)
    160 
--> 161             self.tokenizer = build_tokenizer(
    162                 token_type=token_type,
    163                 bpemodel=bpemodel,

/xxx/espnet/espnet2/text/build_tokenizer.py in build_tokenizer(token_type, bpemodel, non_linguistic_symbols, remove_non_linguistic_symbols, space_symbol, delimiter, g2p_type)
     44 
     45     elif token_type == "char":
---> 46         return CharTokenizer(
     47             non_linguistic_symbols=non_linguistic_symbols,
     48             space_symbol=space_symbol,

/xxx/espnet/espnet2/text/char_tokenizer.py in __init__(self, non_linguistic_symbols, space_symbol, remove_non_linguistic_symbols)
     22         elif isinstance(non_linguistic_symbols, (Path, str)):
     23             non_linguistic_symbols = Path(non_linguistic_symbols)
---> 24             with non_linguistic_symbols.open("r", encoding="utf-8") as f:
     25                 self.non_linguistic_symbols = set(line.rstrip() for line in f)
     26         else:

/usr/lib/python3.9/pathlib.py in open(self, mode, buffering, encoding, errors, newline)
   1240         the built-in open() function does.
   1241         """
-> 1242         return io.open(self, mode, buffering, encoding, errors, newline,
   1243                        opener=self._opener)
   1244 

/usr/lib/python3.9/pathlib.py in _opener(self, name, flags, mode)
   1108     def _opener(self, name, flags, mode=0o666):
   1109         # A stub for the opener argument to built-in open()
-> 1110         return self._accessor.open(self, flags, mode)
   1111 
   1112     def _raw_open(self, flags, mode=0o777):

FileNotFoundError: [Errno 2] No such file or directory: 'data/nlsyms.txt'
abnerLing commented 3 years ago

Thanks @lumaku. I also tested with a pretrained Librispeech model, and it searches for "exp/asr_stats_raw_bpe5000_sp/train/feats_stats.npz". You can work around that issue by running the asr_align.py script from the directory containing exp, and then it works fine, but I'm not sure whether that's intentional.

Traceback (most recent call last):
  File "espnet2/bin/asr_align.py", line 827, in <module>
    main()
  File "espnet2/bin/asr_align.py", line 823, in main
    ctc_align(**kwargs)
  File "espnet2/bin/asr_align.py", line 632, in ctc_align
    aligner = CTCSegmentation(**model, **kwargs)
  File "espnet2/bin/asr_align.py", line 235, in __init__
    asr_model, asr_train_args = ASRTask.build_model_from_file(
  File "/home/abner/work/espnet/espnet2/tasks/abs_task.py", line 1776, in build_model_from_file
    model = cls.build_model(args)
  File "/home/abner/work/espnet/espnet2/tasks/asr.py", line 380, in build_model
    normalize = normalize_class(**args.normalize_conf)
  File "/home/abner/work/espnet/espnet2/layers/global_mvn.py", line 41, in __init__
    stats = np.load(stats_file)
  File "/home/abner/anaconda3/envs/es/lib/python3.8/site-packages/numpy/lib/npyio.py", line 417, in load
    fid = stack.enter_context(open(os_fspath(file), "rb"))
FileNotFoundError: [Errno 2] No such file or directory: 'exp/asr_stats_raw_bpe5000_sp/train/feats_stats.npz'
lumaku commented 3 years ago

The issue with data/nlsyms.txt is specific to the CTC segmentation module, because it additionally uses a tokenizer, which the Speech2Text module does not.

About the second issue, I'm not sure. It should be possible to run the script from anywhere, assuming the Python dependencies are in PYTHONPATH. Does this error also happen when you import the model with Speech2Text using speech2text = Speech2Text(**model), or with espnet2/bin/asr_inference.py?