Closed: zhaoyz1017 closed this issue 1 week ago
Are you running this in a docker container? What command if so?
> Are you running this in a docker container? What command if so?

Yes, I run it in Docker:

```bash
docker run -it --name zhaomegatron -v /jfs/yuzhe.zhao:/home/zyz --gpus all nvcr.io/nvidia/pytorch:23.09-py3
```
> Are you running this in a docker container? What command if so?

Thanks, I think this error is about the Docker container's shared memory. I fixed it by running with `docker run --shm-size="64g"`.

I believe `--ipc=host` should also work.
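For reference, a minimal sketch of the relaunch command, reusing the image and mount from the report above; the `64g` value is simply the size that worked here, and either flag raises the container's `/dev/shm`, which Docker caps at 64 MB by default:

```bash
# Option 1: give the container a larger shared-memory segment explicitly.
# (64g is the value used in this thread; size it to your dataloader workers.)
docker run -it --name zhaomegatron \
    -v /jfs/yuzhe.zhao:/home/zyz \
    --gpus all \
    --shm-size="64g" \
    nvcr.io/nvidia/pytorch:23.09-py3

# Option 2: share the host IPC namespace instead, so /dev/shm is not limited
# to the 64 MB container default.
docker run -it --name zhaomegatron \
    -v /jfs/yuzhe.zhao:/home/zyz \
    --gpus all \
    --ipc=host \
    nvcr.io/nvidia/pytorch:23.09-py3

# Inside the container, `df -h /dev/shm` shows the effective shared-memory limit.
```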
When I was running:

```bash
bash examples/pretrain_gpt_distributed.sh
```
it reported:

```
INFO:megatron.core.datasets.gpt_dataset: Load the sample index from 9e7c6a2dd63142a411059e0b84109cdd-GPTDataset-sample_index.npy
INFO:megatron.core.datasets.gpt_dataset: Load the shuffle index from 9e7c6a2dd63142a411059e0b84109cdd-GPTDataset-shuffle_index.npy
INFO:megatron.core.datasets.gpt_dataset:> total number of samples: 640
finished creating GPT datasets ...
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
Traceback (most recent call last):
File "/home/zyz/code/Megatron-LM-core_v0.6.0/pretrain_gpt.py", line 224, in <module>
pretrain(train_valid_test_datasets_provider,
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 250, in pretrain
= build_train_valid_test_data_iterators(
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 1415, in build_train_valid_test_data_iterators
valid_data_iterator = _get_iterator(dl_type, valid_dataloader)
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 1400, in _get_iterator
return iter(dataloader)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 436, in __iter__
self._iterator = self._get_iterator()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 388, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1024, in __init__
index_queue = multiprocessing_context.Queue()  # type: ignore[var-annotated]
File "/usr/lib/python3.10/multiprocessing/context.py", line 103, in Queue
return Queue(maxsize, ctx=self.get_context())
File "/usr/lib/python3.10/multiprocessing/queues.py", line 43, in __init__
self._rlock = ctx.Lock()
File "/usr/lib/python3.10/multiprocessing/context.py", line 68, in Lock
return Lock(ctx=self.get_context())
File "/usr/lib/python3.10/multiprocessing/synchronize.py", line 162, in __init__
SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
File "/usr/lib/python3.10/multiprocessing/synchronize.py", line 57, in __init__
sl = self._semlock = _multiprocessing.SemLock(
OSError: [Errno 28] No space left on device
Traceback (most recent call last):
File "/home/zyz/code/Megatron-LM-core_v0.6.0/pretrain_gpt.py", line 224, in <module>
pretrain(train_valid_test_datasets_provider,
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 250, in pretrain
= build_train_valid_test_data_iterators(
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 1420, in build_train_valid_test_data_iterators
test_data_iterator = _get_iterator(dl_type, test_dataloader)
File "/home/zyz/code/Megatron-LM-core_v0.6.0/megatron/training/training.py", line 1400, in _get_iterator
return iter(dataloader)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 436, in iter
self._iterator = self._get_iterator()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 388, in _get_iterator
return _MultiProcessingDataLoaderIter(self)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1015, in init
self._worker_result_queue = multiprocessing_context.Queue() # type: ignore[var-annotated]
File "/usr/lib/python3.10/multiprocessing/context.py", line 103, in Queue
return Queue(maxsize, ctx=self.get_context())
File "/usr/lib/python3.10/multiprocessing/queues.py", line 43, in init
self._rlock = ctx.Lock()
File "/usr/lib/python3.10/multiprocessing/context.py", line 68, in Lock
return Lock(ctx=self.get_context())
File "/usr/lib/python3.10/multiprocessing/synchronize.py", line 162, in init
SemLock.init(self, SEMAPHORE, 1, 1, ctx=ctx)
File "/usr/lib/python3.10/multiprocessing/synchronize.py", line 57, in init
sl = self._semlock = _multiprocessing.SemLock(
OSError: [Errno 28] No space left on device
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
ERROR: Unexpected bus error encountered in worker. This might be caused by insufficient shared memory (shm).
Exception ignored in atexit callback: <function _MultiProcessingDataLoaderIter._clean_up_worker at 0x7fb5a94af7f0>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1475, in _clean_up_worker
w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
if not wait([self.sentinel], timeout):
File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/usr/lib/python3.10/selectors.py", line 416, in select
fd_event_list = self._selector.poll(timeout)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 56490) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
Exception ignored in atexit callback: <function _MultiProcessingDataLoaderIter._clean_up_worker at 0x7f86ce34f7f0>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1475, in _clean_up_worker
w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
if not wait([self.sentinel], timeout):
File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/usr/lib/python3.10/selectors.py", line 416, in select
fd_event_list = self._selector.poll(timeout)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 56811) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
Exception ignored in atexit callback: <function _MultiProcessingDataLoaderIter._clean_up_worker at 0x7fc7fa34f7f0>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1475, in _clean_up_worker
w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
if not wait([self.sentinel], timeout):
File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/usr/lib/python3.10/selectors.py", line 416, in select
fd_event_list = self._selector.poll(timeout)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 57242) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
Exception in thread Thread-3 (_pin_memory_loop):
Traceback (most recent call last):
File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
self.run()
File "/usr/lib/python3.10/threading.py", line 953, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 54, in _pin_memory_loop
do_one_step()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/pin_memory.py", line 31, in do_one_step
r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.10/multiprocessing/queues.py", line 122, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.10/dist-packages/torch/multiprocessing/reductions.py", line 316, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.10/multiprocessing/resource_sharer.py", line 86, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.10/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/usr/lib/python3.10/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256)  # reject large message
File "/usr/lib/python3.10/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.10/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.10/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
Exception ignored in atexit callback: <function _MultiProcessingDataLoaderIter._clean_up_worker at 0x7fc7fa34f7f0>
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py", line 1475, in _clean_up_worker
w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
res = self._popen.wait(timeout)
File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
if not wait([self.sentinel], timeout):
File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
ready = selector.select(timeout)
File "/usr/lib/python3.10/selectors.py", line 416, in select
fd_event_list = self._selector.poll(timeout)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 56434) is killed by signal: Bus error. It is possible that dataloader's workers are out of shared memory. Please try to raise your shared memory limit.
[2024-06-19 08:07:08,726] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 54798 closing signal SIGTERM
[2024-06-19 08:07:08,726] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 54800 closing signal SIGTERM
[2024-06-19 08:07:08,791] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 1 (pid: 54799) of binary: /usr/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.1.0a0+29c30b1', 'console_scripts', 'torchrun')())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 797, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 788, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
```

I am not sure what causes this error or how to fix it.