NVIDIA / NeMo

A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)
https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html
Apache License 2.0
11.83k stars 2.46k forks source link

JSONDecodeError: Extra data: line 1 column 126 (char 125) #4156

Closed one1ine closed 2 years ago

one1ine commented 2 years ago

Hello, I am trying to train a custom model using the provided code. I've compiled my custom Cyrillic manifests according to the instructions, with each line being: {"audio_filepath": "/path/to/wav/file", "duration": float, "text": "transcript_in_cyrillic"}

However, when I set up the data, I get the following error. I've double-checked for any invisible characters and whatnot, but my custom manifest is exactly like the one in the example.

---------------------------START------------------------------------------------

JSONDecodeError Traceback (most recent call last) /nfs/home/create_manifest.ipynb Cell 14' in <cell line: 13>() 2 trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=50) 3 #trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=2) 4 5 #import copy (...) 11 12 # Point to the data we'll use for fine-tuning as the training set ---> 13 quartznet_transfer_mon.setup_training_data(train_data_config=params['model']['train_ds']) 15 # Point to the new validation data for fine-tuning 16 quartznet_transfer_mon.setup_validation_data(val_data_config=params['model']['validation_ds'])

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/models/ctc_models.py:464, in EncDecCTCModel.setup_training_data(self, train_data_config) 461 # preserve config 462 self._update_dataset_config(dataset_name='train', config=train_data_config) --> 464 self._train_dl = self._setup_dataloader_from_config(config=train_data_config) 466 # Need to set this because if using an IterableDataset, the length of the dataloader is the total number 467 # of samples rather than the number of batches, and this messes up the tqdm progress bar. 468 # So we set the number of steps manually (to the correct number) to fix this. 469 if 'is_tarred' in train_data_config and train_data_config['is_tarred']: 470 # We also need to check if limit_train_batches is already set. 471 # If it's an int, we assume that the user has set it to something sane, i.e. <= # training batches, 472 # and don't change it. Otherwise, adjust batches accordingly if it's a float (including 1.0).

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/models/ctc_models.py:426, in EncDecCTCModel._setup_dataloader_from_config(self, config) 423 logging.warning(f"Could not load dataset as manifest_filepath was None. Provided config : {config}") 424 return None --> 426 dataset = audio_to_text_dataset.get_char_dataset(config=config, augmentor=augmentor) 428 if hasattr(dataset, 'collate_fn'): 429 collate_fn = dataset.collate_fn

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text_dataset.py:86, in get_char_dataset(config, augmentor) 83 if 'labels' not in config: 84 logging.warning(f"dataset does not have explicitly defined labels") ---> 86 dataset = audio_to_text.AudioToCharDataset( 87 manifest_filepath=config['manifest_filepath'], 88 labels=config.get('labels', None), 89 sample_rate=config['sample_rate'], 90 int_values=config.get('int_values', False), 91 augmentor=augmentor, 92 max_duration=config.get('max_duration', None), 93 min_duration=config.get('min_duration', None), 94 max_utts=config.get('max_utts', 0), 95 blank_index=config.get('blank_index', -1), 96 unk_index=config.get('unk_index', -1), 97 normalize=config.get('normalize_transcripts', False), 98 trim=config.get('trim_silence', False), 99 parser=config.get('parser', 'en'), 100 return_sample_id=config.get('return_sample_id', False), 101 ) 102 return dataset

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:388, in AudioToCharDataset.__init__(self, manifest_filepath, labels, sample_rate, int_values, augmentor, max_duration, min_duration, max_utts, blank_index, unk_index, normalize, trim, bos_id, eos_id, pad_id, parser, return_sample_id) 382 self.labels = labels 384 parser = parsers.make_parser( 385 labels=labels, name=parser, unk_id=unk_index, blank_id=blank_index, do_normalize=normalize 386 ) --> 388 super().__init__( 389 manifest_filepath=manifest_filepath, 390 parser=parser, 391 sample_rate=sample_rate, 392 int_values=int_values, 393 augmentor=augmentor, 394 max_duration=max_duration, 395 min_duration=min_duration, 396 max_utts=max_utts, 397 trim=trim, 398 bos_id=bos_id, 399 eos_id=eos_id, 400 pad_id=pad_id, 401 return_sample_id=return_sample_id, 402 )

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:274, in _AudioTextDataset.__init__(self, manifest_filepath, parser, sample_rate, int_values, augmentor, max_duration, min_duration, max_utts, trim, bos_id, eos_id, pad_id, return_sample_id) 271 if type(manifest_filepath) == str: 272 manifest_filepath = manifest_filepath.split(",") --> 274 self.manifest_processor = ASRManifestProcessor( 275 manifest_filepath=manifest_filepath, 276 parser=parser, 277 max_duration=max_duration, 278 min_duration=min_duration, 279 max_utts=max_utts, 280 bos_id=bos_id, 281 eos_id=eos_id, 282 pad_id=pad_id, 283 ) 284 self.featurizer = WaveformFeaturizer(sample_rate=sample_rate, int_values=int_values, augmentor=augmentor) 285 self.trim = trim

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/asr/data/audio_to_text.py:135, in ASRManifestProcessor.__init__(self, manifest_filepath, parser, max_duration, min_duration, max_utts, bos_id, eos_id, pad_id, index_by_file_id) 121 def __init__( 122 self, 123 manifest_filepath: str, (...) 131 index_by_file_id: bool = False, 132 ): 133 self.parser = parser --> 135 self.collection = collections.ASRAudioText( 136 manifests_files=manifest_filepath, 137 parser=parser, 138 min_duration=min_duration, 139 max_duration=max_duration, 140 max_number=max_utts, 141 index_by_file_id=index_by_file_id, 142 ) 144 self.eos_id = eos_id 145 self.bos_id = bos_id

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/collections.py:206, in ASRAudioText.__init__(self, manifests_files, *args, **kwargs) 196 """Parse lists of audio files, durations and transcripts texts. 197 198 Args: (...) 202 **kwargs: Kwargs to pass to `AudioText` constructor. 203 """ 205 ids, audio_files, durations, texts, offsets, speakers, orig_srs, langs = [], [], [], [], [], [], [], [] --> 206 for item in manifest.item_iter(manifests_files): 207 ids.append(item['id']) 208 audio_files.append(item['audio_file'])

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/manifest.py:72, in item_iter(manifests_files, parse_func) 70 for line in f: 71 k += 1 ---> 72 item = parse_func(line, manifest_file) 73 item['id'] = k 75 yield item

File ~/anaconda3/envs/nemo/lib/python3.8/site-packages/nemo/collections/common/parts/preprocessing/manifest.py:79, in parse_item(line, manifest_file) 78 def parse_item(line: str, manifest_file: str) -> Dict[str, Any]: ---> 79 item = json.loads(line) 81 # Audio file 82 if 'audio_filename' in item:

File ~/anaconda3/envs/nemo/lib/python3.8/json/__init__.py:357, in loads(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw) 352 del kw['encoding'] 354 if (cls is None and object_hook is None and 355 parse_int is None and parse_float is None and 356 parse_constant is None and object_pairs_hook is None and not kw): --> 357 return _default_decoder.decode(s) 358 if cls is None: 359 cls = JSONDecoder

File ~/anaconda3/envs/nemo/lib/python3.8/json/decoder.py:340, in JSONDecoder.decode(self, s, _w) 338 end = _w(s, end).end() 339 if end != len(s): --> 340 raise JSONDecodeError("Extra data", s, end) 341 return obj

JSONDecodeError: Extra data: line 1 column 126 (char 125) ---------------------------------------------------END-------------------------------------------------------

titu1994 commented 2 years ago

You may have a trailing newline in your manifest.

one1ine commented 2 years ago

Yes, some of the lines were doubled. Because there were so many lines in the manifest, the solution was to loop over the manifest file on its own, checking each line with json.loads.

appledora commented 1 year ago

For future reference, this is how I identified the errors:

import json

# Validate a JSON-lines manifest: each line must parse as exactly one JSON
# document. Every line that fails is reported with its 1-based line number,
# the decoder's message, and the offending text.
# UTF-8 is requested explicitly so non-ASCII transcripts (e.g. Cyrillic) are
# read the same way regardless of the platform's default encoding.
with open('<JSON MANIFEST PATH>', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of loading every line up front;
    # manifests can be very large.
    for line_number, line in enumerate(f, start=1):
        # Drop the trailing newline (the last line may not have one)
        record = line.rstrip('\n')

        # Try to parse the line; only the parse itself is guarded
        try:
            json.loads(record)
        except json.JSONDecodeError as e:
            # The position in the decoder's message is relative to this
            # record, so the manifest line number is printed alongside it.
            print(f"line {line_number}: {e}")
            print(record)