Hi, thank you for this great project! I've run into a problem that I can't fix.
I ran helper/create_train.py, which produced a JSON file in the following format:
When I use this file with run_pre_training.py, I hit the error below:
```
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:152 in _generate_tables │
│ │
│ 149 │ │ │ │ │ │ except pa.ArrowInvalid as e: │
│ 150 │ │ │ │ │ │ │ try: │
│ 151 │ │ │ │ │ │ │ │ with open(file, encoding="utf-8") as f │
│ ❱ 152 │ │ │ │ │ │ │ │ │ dataset = json.load(f) │
│ 153 │ │ │ │ │ │ │ except json.JSONDecodeError: │
│ 154 │ │ │ │ │ │ │ │ logger.error(f"Failed to read file '{f │
│ 155 │ │ │ │ │ │ │ │ raise e │
│ │
│ /opt/conda/lib/python3.10/json/__init__.py:293 in load │
│ │
│ 290 │ To use a custom ``JSONDecoder`` subclass, specify it with the ``cl │
│ 291 │ kwarg; otherwise ``JSONDecoder`` is used. │
│ 292 │ """ │
│ ❱ 293 │ return loads(fp.read(), │
│ 294 │ │ cls=cls, object_hook=object_hook, │
│ 295 │ │ parse_float=parse_float, parse_int=parse_int, │
│ 296 │ │ parse_constant=parse_constant, object_pairs_hook=object_pairs_ │
│ │
│ /opt/conda/lib/python3.10/json/__init__.py:346 in loads │
│ │
│ 343 │ if (cls is None and object_hook is None and │
│ 344 │ │ │ parse_int is None and parse_float is None and │
│ 345 │ │ │ parse_constant is None and object_pairs_hook is None and n │
│ ❱ 346 │ │ return _default_decoder.decode(s) │
│ 347 │ if cls is None: │
│ 348 │ │ cls = JSONDecoder │
│ 349 │ if object_hook is not None: │
│ │
│ /opt/conda/lib/python3.10/json/decoder.py:337 in decode │
│ │
│ 334 │ │ containing a JSON document). │
│ 335 │ │ │
│ 336 │ │ """ │
│ ❱ 337 │ │ obj, end = self.raw_decode(s, idx=_w(s, 0).end()) │
│ 338 │ │ end = _w(s, end).end() │
│ 339 │ │ if end != len(s): │
│ 340 │ │ │ raise JSONDecodeError("Extra data", s, end) │
│ │
│ /opt/conda/lib/python3.10/json/decoder.py:355 in raw_decode │
│ │
│ 352 │ │ try: │
│ 353 │ │ │ obj, end = self.scan_once(s, idx) │
│ 354 │ │ except StopIteration as err: │
│ ❱ 355 │ │ │ raise JSONDecodeError("Expecting value", s, err.value) fro │
│ 356 │ │ return obj, end │
│ 357 │
╰──────────────────────────────────────────────────────────────────────────────╯
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1860 in │
│ _prepare_split_single │
│ │
│ 1857 │ │ │ ) │
│ 1858 │ │ │ try: │
│ 1859 │ │ │ │ _time = time.time() │
│ ❱ 1860 │ │ │ │ for _, table in generator: │
│ 1861 │ │ │ │ │ if max_shard_size is not None and writer._num_byt │
│ 1862 │ │ │ │ │ │ num_examples, num_bytes = writer.finalize() │
│ 1863 │ │ │ │ │ │ writer.close() │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:155 in _generate_tables │
│ │
│ 152 │ │ │ │ │ │ │ │ │ dataset = json.load(f) │
│ 153 │ │ │ │ │ │ │ except json.JSONDecodeError: │
│ 154 │ │ │ │ │ │ │ │ logger.error(f"Failed to read file '{f │
│ ❱ 155 │ │ │ │ │ │ │ │ raise e │
│ 156 │ │ │ │ │ │ │ # If possible, parse the file as a list of │
│ 157 │ │ │ │ │ │ │ if isinstance(dataset, list): # list is t │
│ 158 │ │ │ │ │ │ │ │ try: │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:131 in _generate_tables │
│ │
│ 128 │ │ │ │ │ │ try: │
│ 129 │ │ │ │ │ │ │ while True: │
│ 130 │ │ │ │ │ │ │ │ try: │
│ ❱ 131 │ │ │ │ │ │ │ │ │ pa_table = paj.read_json( │
│ 132 │ │ │ │ │ │ │ │ │ │ io.BytesIO(batch), read_option │
│ 133 │ │ │ │ │ │ │ │ │ ) │
│ 134 │ │ │ │ │ │ │ │ │ break │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/_json.pyx:259 in │
│ pyarrow._json.read_json │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/_json.pyx' │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/error.pxi:144 in │
│ pyarrow.lib.pyarrow_internal_check_status │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/error.pxi' │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/error.pxi:100 in │
│ pyarrow.lib.check_status │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/error.pxi' │
╰──────────────────────────────────────────────────────────────────────────────╯
ArrowInvalid: JSON parse error: Invalid value. in row 0
The above exception was the direct cause of the following exception:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /kaggle/working/zalo_ltr_2021/Condenser/run_pre_training.py:202 in <module> │
│ │
│ 199 │
│ 200 │
│ 201 if __name__ == "__main__": │
│ ❱ 202 │ main() │
│ 203 │
│ │
│ /kaggle/working/zalo_ltr_2021/Condenser/run_pre_training.py:95 in main │
│ │
│ 92 │ # Set seed before initializing model. │
│ 93 │ set_seed(training_args.seed) │
│ 94 │ │
│ ❱ 95 │ train_set = load_dataset( │
│ 96 │ │ 'json', │
│ 97 │ │ data_files=data_args.train_path, │
│ 98 │ │ block_size=2**25, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/load.py:1782 in │
│ load_dataset │
│ │
│ 1779 │ try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES │
│ 1780 │ │
│ 1781 │ # Download and prepare data │
│ ❱ 1782 │ builder_instance.download_and_prepare( │
│ 1783 │ │ download_config=download_config, │
│ 1784 │ │ download_mode=download_mode, │
│ 1785 │ │ verification_mode=verification_mode, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:872 in │
│ download_and_prepare │
│ │
│ 869 │ │ │ │ │ │ │ prepare_split_kwargs["max_shard_size"] = │
│ 870 │ │ │ │ │ │ if num_proc is not None: │
│ 871 │ │ │ │ │ │ │ prepare_split_kwargs["num_proc"] = num_pr │
│ ❱ 872 │ │ │ │ │ │ self._download_and_prepare( │
│ 873 │ │ │ │ │ │ │ dl_manager=dl_manager, │
│ 874 │ │ │ │ │ │ │ verification_mode=verification_mode, │
│ 875 │ │ │ │ │ │ │ **prepare_split_kwargs, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:967 in │
│ _download_and_prepare │
│ │
│ 964 │ │ │ │
│ 965 │ │ │ try: │
│ 966 │ │ │ │ # Prepare split will record examples associated to th │
│ ❱ 967 │ │ │ │ self._prepare_split(split_generator, **prepare_split_ │
│ 968 │ │ │ except OSError as e: │
│ 969 │ │ │ │ raise OSError( │
│ 970 │ │ │ │ │ "Cannot find data file. " │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1749 in │
│ _prepare_split │
│ │
│ 1746 │ │ │ gen_kwargs = split_generator.gen_kwargs │
│ 1747 │ │ │ job_id = 0 │
│ 1748 │ │ │ with pbar: │
│ ❱ 1749 │ │ │ │ for job_id, done, content in self._prepare_split_sing │
│ 1750 │ │ │ │ │ gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_ │
│ 1751 │ │ │ │ ): │
│ 1752 │ │ │ │ │ if done: │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1892 in │
│ _prepare_split_single │
│ │
│ 1889 │ │ │ # Ignore the writer's error for no examples written to th │
│ 1890 │ │ │ if isinstance(e, SchemaInferenceError) and e.__context__ │
│ 1891 │ │ │ │ e = e.__context__ │
│ ❱ 1892 │ │ │ raise DatasetGenerationError("An error occurred while gen │
│ 1893 │ │ │
│ 1894 │ │ yield job_id, True, (total_num_examples, total_num_bytes, wri │
│ 1895 │
╰──────────────────────────────────────────────────────────────────────────────╯
DatasetGenerationError: An error occurred while generating the dataset
```
I suspect the problem might be an incompatible combination of datasets and transformers versions, but I have tried many versions of datasets and still hit the same error. Could you help me fix this? Thank you so much!
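From the traceback, both pyarrow's read_json and the json.load fallback fail on the very first byte of the file (line 1 column 1, char 0), so the file may not be in the newline-delimited format that load_dataset('json', ...) expects. Here is a minimal sanity check I can run (the path is a placeholder for my actual train file) to verify that each line parses as a standalone JSON object:

```python
import json

TRAIN_PATH = "train_data.json"  # placeholder: the file produced by helper/create_train.py

with open(TRAIN_PATH, encoding="utf-8") as f:
    for lineno, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"line {lineno} is not a standalone JSON object: {e}")
            break
    else:
        print("every non-empty line parses as JSON (valid JSON Lines)")
```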
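In case the file was written as one big top-level JSON array rather than one object per line, a sketch like this (the paths are placeholders, and the assumption that the file holds a single JSON list may not match what create_train.py actually emits) would convert it to newline-delimited JSON:

```python
import json

SRC = "train_data.json"   # placeholder: file produced by helper/create_train.py
DST = "train_data.jsonl"  # newline-delimited output for load_dataset('json', ...)

with open(SRC, encoding="utf-8") as f:
    examples = json.load(f)  # assumes the file is a single top-level JSON list

with open(DST, "w", encoding="utf-8") as f:
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
```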