Closed roedoejet closed 5 months ago
In EveryVoice, `use_phonological_feats: false` is the default setting
in `config/everyvoice-text-to-spec.yaml`.
For example, with the language `moh`,
if we set that variable to `true`
and run `preprocess`, we get the error below:
2024-03-11 12:07:07.923 | WARNING | everyvoice.preprocessor.preprocessor:preprocess:793 - Symbol '"' occurs once but was not declared in your configuration so it is being ignored.
Processing pfs on 1 CPU: 0%| | 0/1380 [00:00<?, ?it/s]
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/m │
│ odel/feature_prediction/FastSpeech2_lightning/fs2/cli/preprocess.py:35 in │
│ preprocess │
│ │
│ 32 │ │
│ 33 │ from ..config import FastSpeech2Config │
│ 34 │ │
│ ❱ 35 │ preprocessor, config, processed = preprocess_base_command( │
│ 36 │ │ model_config=FastSpeech2Config, │
│ 37 │ │ steps=[step.name for step in steps], │
│ 38 │ │ **kwargs, │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/b │
│ ase_cli/helpers.py:111 in preprocess_base_command │
│ │
│ 108 │ preprocessor = Preprocessor(config) │
│ 109 │ if isinstance(config, FastSpeech2Config) and config.model.use_phon │
│ 110 │ │ steps.append("pfs") │
│ ❱ 111 │ preprocessor.preprocess( │
│ 112 │ │ cpus=cpus, │
│ 113 │ │ overwrite=overwrite, │
│ 114 │ │ to_process=steps, │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/p │
│ reprocessor/preprocessor.py:785 in preprocess │
│ │
│ 782 │ │ │ │ process_fn = self.get_process_fn(process) │
│ 783 │ │ │ │ missing_symbols_before = Counter(self.text_processor.m │
│ 784 │ │ │ │ for f in tqdm(filelist, desc=f"Processing {process} on │
│ ❱ 785 │ │ │ │ │ process_fn(f) │
│ 786 │ │ │ │ # if only one of "pfs" or "text" is specified, missing │
│ 787 │ │ │ │ # will always be empty, but if both are specified this │
│ 788 │ │ │ │ # each process gets only its own missing symbols logge │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/p │
│ reprocessor/preprocessor.py:603 in process_text │
│ │
│ 600 │ │ text_path = self.create_path(item, "text", basename) │
│ 601 │ │ if text_path.exists() and not self.overwrite: │
│ 602 │ │ │ return │
│ ❱ 603 │ │ text = self.extract_text_inputs( │
│ 604 │ │ │ item["text"], self.text_processor, use_pfs=use_pfs, quiet= │
│ 605 │ │ ) │
│ 606 │ │ save_tensor(text, text_path) │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/p │
│ reprocessor/preprocessor.py:294 in extract_text_inputs │
│ │
│ 291 │ │ │ raise ValueError("Text processor not initialized") │
│ 292 │ │ if use_pfs: │
│ 293 │ │ │ return torch.Tensor( │
│ ❱ 294 │ │ │ │ text_processor.text_to_phonological_features(text, qui │
│ 295 │ │ │ ).long() │
│ 296 │ │ else: │
│ 297 │ │ │ return torch.Tensor(text_processor.text_to_sequence(text, │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/t │
│ ext/__init__.py:96 in text_to_phonological_features │
│ │
│ 93 │ │ │ List of phonological feature vectors │
│ 94 │ │ """ │
│ 95 │ │ clean_text = self.text_to_tokens(text, quiet) │
│ ❱ 96 │ │ return get_features(clean_text) │
│ 97 │ │
│ 98 │ def clean_text(self, text: str) -> str: │
│ 99 │ │ """Converts some text to cleaned text""" │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/t │
│ ext/features.py:102 in get_features │
│ │
│ 99 │ # tokens = tokenizer.tokenize(text) │
│ 100 │ punctuation_features = get_punctuation_features(tokens) │
│ 101 │ tone_features = get_tone_features(tokens) │
│ ❱ 102 │ spe_features = [char_to_vector_list(t) for t in tokens] │
│ 103 │ assert len(punctuation_features) == len(tone_features) == len(spe_ │
│ 104 │ return [ │
│ 105 │ │ spe_features[i] + tone_features[i] + punctuation_features[i] │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/t │
│ ext/features.py:102 in <listcomp> │
│ │
│ 99 │ # tokens = tokenizer.tokenize(text) │
│ 100 │ punctuation_features = get_punctuation_features(tokens) │
│ 101 │ tone_features = get_tone_features(tokens) │
│ ❱ 102 │ spe_features = [char_to_vector_list(t) for t in tokens] │
│ 103 │ assert len(punctuation_features) == len(tone_features) == len(spe_ │
│ 104 │ return [ │
│ 105 │ │ spe_features[i] + tone_features[i] + punctuation_features[i] │
│ │
│ /gpfs/fs5/nrc/nrc-fs1/ict/others/u/tes001/TxT2SPEECH/EveryVoice/everyvoice/t │
│ ext/features.py:93 in char_to_vector_list │
│ │
│ 90 │ try: │
│ 91 │ │ return vec[0] │
│ 92 │ except IndexError: │
│ ❱ 93 │ │ breakpoint() │
│ 94 │
│ 95 │
│ 96 def get_features(tokens): │
│ │
│ /home/tes001/u/TxT2SPEECH/miniconda3_u20/envs/EveryVoice/lib/python3.10/bdb. │
│ py:94 in trace_dispatch │
│ │
│ 91 │ │ if event == 'call': │
│ 92 │ │ │ return self.dispatch_call(frame, arg) │
│ 93 │ │ if event == 'return': │
│ ❱ 94 │ │ │ return self.dispatch_return(frame, arg) │
│ 95 │ │ if event == 'exception': │
│ 96 │ │ │ return self.dispatch_exception(frame, arg) │
│ 97 │ │ if event == 'c_call': │
│ │
│ /home/tes001/u/TxT2SPEECH/miniconda3_u20/envs/EveryVoice/lib/python3.10/bdb. │
│ py:156 in dispatch_return │
│ │
│ 153 │ │ │ │ self.user_return(frame, arg) │
│ 154 │ │ │ finally: │
│ 155 │ │ │ │ self.frame_returning = None │
│ ❱ 156 │ │ │ if self.quitting: raise BdbQuit │
│ 157 │ │ │ # The user issued a 'next' or 'until' command. │
│ 158 │ │ │ if self.stopframe is frame and self.stoplineno != -1: │
│ 159 │ │ │ │ self._set_stopinfo(None, None) │
╰──────────────────────────────────────────────────────────────────────────────╯
BdbQuit
Keeping a list of issues to raise.

Because we need to handle the processing (normalization, phonemization, tokenization) of these various representations of text, I propose:

- `DatasetTextRepresentation`: an Enum field that requires users to specify the representation level of their text (Character | IPA | Arpabet).
- `TargetTrainingRepresentationLevel`: an Enum field (characters | phones | phonological features) passed to models which use text.
- Use the `g2p` library, and use the `ipatok` library to tokenize output (with `unknown=True` to allow punctuation and other unknown characters through).
- Add a `character` sequence and a `phone` sequence to the filelist generated by the preprocessor if a valid g2p method for the language exists. Join these sequences with forward slashes (i.e. h/e/l/l/o h/o/w a/r/e y/o/u) since pipes are already taken in that format. Allow forward slashes to be escaped, maybe?

Rough sketch to help jog my memory: