A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)
FSDP CPU offloading (model.fsdp=True and model.fsdp_cpu_offload=True) raises errors due to disallowed device placements (see error and full traceback below). This behavior is observed with both True and False values for model.use_cpu_initialization.
RuntimeError: An FSDP-managed module with parameter CPU offloading enabled has parameters on cuda:0. Make sure to not move the module from CPU when offloading parameters.
(On other processes, a different CUDA device index (e.g., cuda:1) is reported.)
Traceback
```
Traceback (most recent call last):
File "/nemo/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 42, in main
trainer.fit(model)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
call._call_and_handle_interrupt(
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
results = self._run_stage()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
self.fit_loop.run()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 159, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 1263, in optimizer_step
super().optimizer_step(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1308, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/fsdp.py", line 149, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 122, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py", line 391, in wrapper
out = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/apex/optimizers/fused_adam.py", line 140, in step
loss = closure()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 108, in _wrap_closure
closure_result = closure()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 129, in closure
step_output = self._step_fn()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 317, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 311, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
return self.lightning_module.training_step(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/utils/model_utils.py", line 434, in wrap_training_step
output_dict = wrapped(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 834, in training_step
loss_mean = self.training_step_fwd_bwd_step_call(dataloader_iter, forward_only=False)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 770, in training_step_fwd_bwd_step_call
loss_mean = self.fwd_bwd_step(dataloader_iter, forward_only)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 703, in fwd_bwd_step
losses_reduced_per_micro_batch = fwd_bwd_function(
File "/nemo/repos/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 399, in forward_backward_no_pipelining
output_tensor, num_tokens = forward_step(
File "/nemo/repos/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 206, in forward_step
output_tensor, loss_func = forward_step_func(data_iterator, model)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 1251, in fwd_output_and_loss_func
output_tensor = model(**forward_args)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 821, in forward
args, kwargs = _root_pre_forward(self, self, args, kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 510, in _root_pre_forward
_lazy_init(state, module)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 132, in _lazy_init
_check_flat_params_on_expected_device(state, root_module)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 159, in _check_flat_params_on_expected_device
raise RuntimeError(
RuntimeError: An FSDP-managed module with parameter CPU offloading enabled has parameters on cuda:0. Make sure to not move the module from CPU when offloading parameters.
```
(On other processes, a different CUDA device index (e.g., `cuda:1`) is reported.)
No error should be raised or the config should be removed if it is not going to be supported. If usage of CPU offloading requires use_cpu_initialization=True for a fix, an error should be raised if it is not set correctly.
Environment overview
Environment location: [Apptainer, using NVIDIA PyTorch 24.05 Docker container with no modifications inside the container, but a venv outside the container]
Method of NeMo install: [pip install from source]. git clone https://github.com/NVIDIA/NeMo.git && cd NeMo && git checkout dda92f00de2785de46983d7aa4ac77cbb1b353ec && python -m pip install .[all]
Method of Megatron-LM install: [pip install from source]. git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout a645f89671be698612170539f2089dc15db66a80 && python -m pip install .
Describe the bug
FSDP CPU offloading (`model.fsdp=True` and `model.fsdp_cpu_offload=True`) raises errors due to disallowed device placements (see error and full traceback below). This behavior is observed with both True and False values for `model.use_cpu_initialization`.
(On other processes, a different CUDA device index (e.g., `cuda:1`) is reported.)
Traceback
```
Traceback (most recent call last):
File "/nemo/repos/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 42, in main
trainer.fit(model)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
call._call_and_handle_interrupt(
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 44, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _run
results = self._run_stage()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/trainer.py", line 1030, in _run_stage
self.fit_loop.run()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 205, in run
self.advance()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 159, in _call_lightning_module_hook
output = fn(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_base_model.py", line 1263, in optimizer_step
super().optimizer_step(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/core/module.py", line 1308, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/fsdp.py", line 149, in optimizer_step
return super().optimizer_step(optimizer, model=model, closure=closure, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 122, in optimizer_step
return optimizer.step(closure=closure, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/optim/optimizer.py", line 391, in wrapper
out = func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/apex/optimizers/fused_adam.py", line 140, in step
loss = closure()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/plugins/precision/precision.py", line 108, in _wrap_closure
closure_result = closure()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 129, in closure
step_output = self._step_fn()
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/loops/optimization/automatic.py", line 317, in _training_step
training_step_output = call._call_strategy_hook(trainer, "training_step", *kwargs.values())
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/call.py", line 311, in _call_strategy_hook
output = fn(*args, **kwargs)
File "/nemo/env/lib/python3.10/site-packages/pytorch_lightning/strategies/strategy.py", line 390, in training_step
return self.lightning_module.training_step(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/utils/model_utils.py", line 434, in wrap_training_step
output_dict = wrapped(*args, **kwargs)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 834, in training_step
loss_mean = self.training_step_fwd_bwd_step_call(dataloader_iter, forward_only=False)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 770, in training_step_fwd_bwd_step_call
loss_mean = self.fwd_bwd_step(dataloader_iter, forward_only)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 703, in fwd_bwd_step
losses_reduced_per_micro_batch = fwd_bwd_function(
File "/nemo/repos/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 399, in forward_backward_no_pipelining
output_tensor, num_tokens = forward_step(
File "/nemo/repos/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 206, in forward_step
output_tensor, loss_func = forward_step_func(data_iterator, model)
File "/nemo/repos/NeMo/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py", line 1251, in fwd_output_and_loss_func
output_tensor = model(**forward_args)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 821, in forward
args, kwargs = _root_pre_forward(self, self, args, kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 510, in _root_pre_forward
_lazy_init(state, module)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 132, in _lazy_init
_check_flat_params_on_expected_device(state, root_module)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_runtime_utils.py", line 159, in _check_flat_params_on_expected_device
raise RuntimeError(
RuntimeError: An FSDP-managed module with parameter CPU offloading enabled has parameters on cuda:0. Make sure to not move the module from CPU when offloading parameters.
```
(On other processes, a different CUDA device index (e.g., `cuda:1`) is reported.)
Steps/Code to reproduce bug
Expected behavior
No error should be raised, or the config should be removed if it is not going to be supported. If usage of CPU offloading requires `use_cpu_initialization=True` for a fix, an error should be raised if it is not set correctly.
Environment overview
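Environment location: [Apptainer, using NVIDIA PyTorch 24.05 Docker container with no modifications inside the container, but a `venv` outside the container]
Method of NeMo install: [pip install from source]. git clone https://github.com/NVIDIA/NeMo.git && cd NeMo && git checkout dda92f00de2785de46983d7aa4ac77cbb1b353ec && python -m pip install .[all]
Method of Megatron-LM install: [pip install from source]. git clone https://github.com/NVIDIA/Megatron-LM.git && cd Megatron-LM && git checkout a645f89671be698612170539f2089dc15db66a80 && python -m pip install .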