Closed one1ine closed 2 years ago
You may have a trailing newline in your manifest.
Yes, some of the lines were doubled. Because the manifest had so many lines, the solution was to read the JSON manifest in a loop and check each line with json.loads.
For future reference, this is how I identified the errors:
import json

# Scan the manifest line by line and report every line that is not exactly one
# valid JSON document (for example two records fused on one line, or a blank
# line). Each bad line is printed with its 1-based line number and the parser
# error so it can be located in an editor.
# NOTE(review): assumes the manifest is UTF-8 encoded -- confirm for your data.
with open('<JSON MANIFEST PATH>', 'r', encoding='utf-8') as f:
    # Iterate the file object directly instead of readlines() so a very large
    # manifest is streamed rather than loaded into memory all at once.
    for line_number, data in enumerate(f, start=1):
        # Drop the trailing newline so the echoed line is printed cleanly.
        data = data.rstrip('\n')
        try:
            # Only validity matters here; the parsed value is not used.
            json.loads(data)
        except json.JSONDecodeError as e:
            # Say where and why parsing failed, then echo the offending line.
            print(f'line {line_number}: {e}')
            print(data)
Hello, I am trying to train a custom model using the provided code. I've compiled my custom Cyrillic manifests according to the instructions, with each line being: {"audio_filepath": "/path/to/wav/file", "duration": float, "text": "transcript_in_cyrillic"}
However, when I set up the data I get the following error. I've double-checked for any invisible characters and whatnot, but my custom manifest is exactly like the one in the example.
---------------------------START------------------------------------------------
JSONDecodeError Traceback (most recent call last) /nfs/home/create_manifest.ipynb Cell 14' in <cell line: 13>() 2 trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50) 3 #trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=2) 4 5 #import copy (...) 11 12 # Point to the data we'll use for fine-tuning as the training set ---> 13 quartznet_transfer_mon.setup_training_data(train_data_config=params['model']['train_ds']) 15 # Point to the new validation data for fine-tuning 16 quartznet_transfer_mon.setup_validation_data(val_data_config=params['model']['validation_ds'])
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/models/ctc_models.py:464, in EncDecCTCModel.setup_training_data(self, train_data_config) 461 # preserve config 462 self._update_dataset_config(dataset_name='train', config=train_data_config) --> 464 self._train_dl = self._setup_dataloader_from_config(config=train_data_config) 466 # Need to set this because if using an IterableDataset, the length of the dataloader is the total number 467 # of samples rather than the number of batches, and this messes up the tqdm progress bar. 468 # So we set the number of steps manually (to the correct number) to fix this. 469 if 'is_tarred' in train_data_config and train_data_config['is_tarred']: 470 # We also need to check if limit_train_batches is already set. 471 # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, 472 # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0).
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/models/ctc_models.py:426, in EncDecCTCModel._setup_dataloader_from_config(self, config) 423 logging.warning(f"Could not load dataset as `manifest_filepath` was None. Provided config : {config}") 424 return None --> 426 dataset = audio_to_text_dataset.get_char_dataset(config=config, augmentor=augmentor) 428 if hasattr(dataset, 'collate_fn'): 429 collate_fn = dataset.collate_fn
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text_dataset.py:86, in get_char_dataset(config, augmentor) 83 if 'labels' not in config: 84 logging.warning(f"dataset does not have explicitly defined labels") ---> 86 dataset = audio_to_text.AudioToCharDataset( 87 manifest_filepath=config['manifest_filepath'], 88 labels=config.get('labels', None), 89 sample_rate=config['sample_rate'], 90 int_values=config.get('int_values', False), 91 augmentor=augmentor, 92 max_duration=config.get('max_duration', None), 93 min_duration=config.get('min_duration', None), 94 max_utts=config.get('max_utts', 0), 95 blank_index=config.get('blank_index', -1), 96 unk_index=config.get('unk_index', -1), 97 normalize=config.get('normalize_transcripts', False), 98 trim=config.get('trim_silence', False), 99 parser=config.get('parser', 'en'), 100 return_sample_id=config.get('return_sample_id', False), 101 ) 102 return dataset
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:388, in AudioToCharDataset.__init__(self, manifest_filepath, labels, sample_rate, int_values, augmentor, max_duration, min_duration, max_utts, blank_index, unk_index, normalize, trim, bos_id, eos_id, pad_id, parser, return_sample_id) 382 self.labels = labels 384 parser = parsers.make_parser( 385 labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize 386 ) --> 388 super().__init__( 389 manifest_filepath=manifest_filepath, 390 parser=parser, 391 sample_rate=sample_rate, 392 int_values=int_values, 393 augmentor=augmentor, 394 max_duration=max_duration, 395 min_duration=min_duration, 396 max_utts=max_utts, 397 trim=trim, 398 bos_id=bos_id, 399 eos_id=eos_id, 400 pad_id=pad_id, 401 return_sample_id=return_sample_id, 402 )
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:274, in _AudioTextDataset.__init__(self, manifest_filepath, parser, sample_rate, int_values, augmentor, max_duration, min_duration, max_utts, trim, bos_id, eos_id, pad_id, return_sample_id) 271 if type(manifest_filepath) == str: 272 manifest_filepath = manifest_filepath.split(",") --> 274 self.manifest_processor = ASRManifestProcessor( 275 manifest_filepath=manifest_filepath, 276 parser=parser, 277 max_duration=max_duration, 278 min_duration=min_duration, 279 max_utts=max_utts, 280 bos_id=bos_id, 281 eos_id=eos_id, 282 pad_id=pad_id, 283 ) 284 self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) 285 self.trim = trim
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:135, in ASRManifestProcessor.__init__(self, manifest_filepath, parser, max_duration, min_duration, max_utts, bos_id, eos_id, pad_id, index_by_file_id) 121 def __init__( 122 self, 123 manifest_filepath: str, (...) 131 index_by_file_id: bool = False, 132 ): 133 self.parser = parser --> 135 self.collection = collections.ASRAudioText( 136 manifests_files=manifest_filepath, 137 parser=parser, 138 min_duration=min_duration, 139 max_duration=max_duration, 140 max_number=max_utts, 141 index_by_file_id=index_by_file_id, 142 ) 144 self.eos_id = eos_id 145 self.bos_id = bos_id
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/collections.py:206, in ASRAudioText.__init__(self, manifests_files, *args, **kwargs) 196 """Parse lists of audio files, durations and transcripts texts. 197 198 Args: (...) 202 **kwargs: Kwargs to pass to `AudioText` constructor. 203 """ 205 ids, audio_files, durations, texts, offsets, speakers, orig_srs, langs = [], [], [], [], [], [], [], [] --> 206 for item in manifest.item_iter(manifests_files): 207 ids.append(item['id']) 208 audio_files.append(item['audio_file'])
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/manifest.py:72, in item_iter(manifests_files, parse_func) 70 for line in f: 71 k += 1 ---> 72 item = parse_func(line, manifest_file) 73 item['id'] = k 75 yield item
File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/manifest.py:79, in parse_item(line, manifest_file) 78 def parse_item(line: str, manifest_file: str) -> Dict[str, Any]: ---> 79 item = json.loads(line) 81 # Audio file 82 if 'audio_filename' in item:
File ~/anaconda3/envs/nemo/lib/python3.8/json/__init__.py:357, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 352 del kw['encoding'] 354 if (cls is None and object_hook is None and 355 parse_int is None and parse_float is None and 356 parse_constant is None and object_pairs_hook is None and not kw): --> 357 return _default_decoder.decode(s) 358 if cls is None: 359 cls = JSONDecoder
File ~/anaconda3/envs/nemo/lib/python3.8/json/decoder.py:340, in JSONDecoder.decode(self, s, _w) 338 end = _w(s, end).end() 339 if end != len(s): --> 340 raise JSONDecodeError("Extra data", s, end) 341 return obj
JSONDecodeError: Extra data: line 1 column 126 (char 125) ---------------------------------------------------END-------------------------------------------------------