Grouping texts in chunks of 512 (num_proc=10): 0%| | 0/361878 [00:01<?, ? examples/s]
multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3489, in _map_single
writer.write_batch(batch)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_writer.py", line 560, in write_batch
pa_table = pa.Table.from_arrays(arrays, schema=schema)
File "pyarrow/table.pxi", line 3986, in pyarrow.lib.Table.from_arrays
File "pyarrow/table.pxi", line 3266, in pyarrow.lib.Table.validate
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "pretraining.py", line 752, in <module>
main()
File "pretraining.py", line 538, in main
lm_datasets = raw_datasets.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 868, in map
{
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
k: dataset.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3185, in map
for rank, done, content in iflatmap_unordered(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
raise self._value
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
Grouping texts in chunks of 512 (num_proc=10): 0%| | 0/361878 [00:01<?, ? examples/s]
multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3489, in _map_single
writer.write_batch(batch)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_writer.py", line 560, in write_batch
pa_table = pa.Table.from_arrays(arrays, schema=schema)
File "pyarrow/table.pxi", line 3986, in pyarrow.lib.Table.from_arrays
File "pyarrow/table.pxi", line 3266, in pyarrow.lib.Table.validate
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "pretraining.py", line 752, in <module>
main()
File "pretraining.py", line 538, in main
lm_datasets = raw_datasets.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 868, in map
{
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
k: dataset.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3185, in map
for rank, done, content in iflatmap_unordered(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
raise self._value
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
[2024-01-15 17:06:14,620] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 237382) of binary: /home/centos/anaconda3/envs/cpt/bin/python
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
Describe the bug
Grouping texts in chunks of 512 (num_proc=10): 0%| | 0/361878 [00:01<?, ? examples/s] multiprocess.pool.RemoteTraceback: """ Traceback (most recent call last): File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker result = (True, func(*args, **kwds)) File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue for i, result in enumerate(func(**kwargs)): File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3489, in _map_single writer.write_batch(batch) File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_writer.py", line 560, in write_batch pa_table = pa.Table.from_arrays(arrays, schema=schema) File "pyarrow/table.pxi", line 3986, in pyarrow.lib.Table.from_arrays File "pyarrow/table.pxi", line 3266, in pyarrow.lib.Table.validate File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709 """
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "pretraining.py", line 752, in <module>
main()
File "pretraining.py", line 538, in main
lm_datasets = raw_datasets.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 868, in map
{
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
k: dataset.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3185, in map
for rank, done, content in iflatmap_unordered(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
raise self._value
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
Grouping texts in chunks of 512 (num_proc=10): 0%| | 0/361878 [00:01<?, ? examples/s]
multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 614, in _write_generator_to_queue
for i, result in enumerate(func(**kwargs)):
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3489, in _map_single
writer.write_batch(batch)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_writer.py", line 560, in write_batch
pa_table = pa.Table.from_arrays(arrays, schema=schema)
File "pyarrow/table.pxi", line 3986, in pyarrow.lib.Table.from_arrays
File "pyarrow/table.pxi", line 3266, in pyarrow.lib.Table.validate
File "pyarrow/error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last): File "pretraining.py", line 752, in <module>
main()
File "pretraining.py", line 538, in main
lm_datasets = raw_datasets.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 868, in map
{
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/dataset_dict.py", line 869, in <dictcomp>
k: dataset.map(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 3185, in map
for rank, done, content in iflatmap_unordered(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in iflatmap_unordered
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/datasets/utils/py_utils.py", line 654, in <listcomp>
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/multiprocess/pool.py", line 771, in get
raise self._value
pyarrow.lib.ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 709
[2024-01-15 17:06:14,620] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 237382) of binary: /home/centos/anaconda3/envs/cpt/bin/python
Traceback (most recent call last):
File "/home/centos/anaconda3/envs/cpt/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/centos/anaconda3/envs/cpt/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
pretraining.py FAILED
Failures: [1]: time : 2024-01-15_17:06:14 host : host190 rank : 1 (local_rank: 1) exitcode : 1 (pid: 237383) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure): [0]: time : 2024-01-15_17:06:14 host : host190 rank : 0 (local_rank: 0) exitcode : 1 (pid: 237382) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html