Open Alireza3242 opened 2 weeks ago
Describe the bug I followed the instructions in: https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/nemo_megatron/gpt/gpt_training.html
Then I replaced 1024 with 512:
python src/examples/nlp/language_modeling/megatron_gpt_pretraining.py \ --config-path=conf \ --config-name=megatron_gpt_config \ trainer.devices=1 \ trainer.num_nodes=1 \ trainer.max_epochs=null \ trainer.max_steps=30000 \ trainer.val_check_interval=300 \ trainer.log_every_n_steps=50 \ trainer.limit_val_batches=50 \ trainer.limit_test_batches=50 \ trainer.accumulate_grad_batches=1 \ trainer.precision=16 \ model.micro_batch_size=6 \ model.global_batch_size=192 \ model.tensor_model_parallel_size=1 \ model.pipeline_model_parallel_size=1 \ model.max_position_embeddings=512 \ model.encoder_seq_length=512 \ model.hidden_size=768 \ model.ffn_hidden_size=3072 \ model.num_layers=12 \ model.num_attention_heads=12 \ model.init_method_std=0.021 \ model.hidden_dropout=0.1 \ model.layernorm_epsilon=1e-5 \ model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json \ model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt \ model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document] \ model.data.num_workers=2 \ model.data.seq_length=512 \ model.data.splits_string=\'980,10,10\' \ model.megatron_amp_O2=False \ model.optim.name=fused_adam \ model.optim.lr=6e-4 \ model.optim.betas=[0.9,0.95] \ model.optim.weight_decay=0.1 \ model.optim.sched.name=CosineAnnealing \ model.optim.sched.warmup_steps=750 \ model.optim.sched.constant_steps=80000 \ model.optim.sched.min_lr=6e-5 \ exp_manager.resume_if_exists=True \ exp_manager.resume_ignore_no_checkpoint=True \ exp_manager.create_checkpoint_callback=True \ exp_manager.checkpoint_callback_params.monitor=val_loss \ exp_manager.checkpoint_callback_params.save_top_k=3 \ exp_manager.checkpoint_callback_params.mode=min \ exp_manager.checkpoint_callback_params.always_save_nemo=False
I get this error:
Error executing job with overrides: ['trainer.devices=1', 'trainer.num_nodes=1', 'trainer.max_epochs=null', 'trainer.max_steps=30000', 'trainer.val_check_interval=300', 'trainer.log_every_n_steps=50', 'trainer.limit_val_batches=50', 'trainer.limit_test_batches=50', 'trainer.accumulate_grad_batches=1', 'trainer.precision=16', 'model.micro_batch_size=6', 'model.global_batch_size=192', 'model.tensor_model_parallel_size=1', 'model.pipeline_model_parallel_size=1', 'model.max_position_embeddings=512', 'model.encoder_seq_length=512', 'model.fp8_amax_history_len=512', 'model.hidden_size=768', 'model.ffn_hidden_size=3072', 'model.num_layers=12', 'model.num_attention_heads=12', 'model.init_method_std=0.021', 'model.hidden_dropout=0.1', 'model.layernorm_epsilon=1e-5', 'model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json', 'model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt', 'model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document]', 'model.data.num_workers=2', 'model.data.seq_length=512', "model.data.splits_string='980,10,10'", 'model.megatron_amp_O2=False', 'model.optim.name=fused_adam', 'model.optim.lr=6e-4', 'model.optim.betas=[0.9,0.95]', 'model.optim.weight_decay=0.1', 'model.optim.sched.name=CosineAnnealing', 'model.optim.sched.warmup_steps=750', 'model.optim.sched.constant_steps=80000', 'model.optim.sched.min_lr=6e-5', 'exp_manager.resume_if_exists=True', 'exp_manager.resume_ignore_no_checkpoint=True', 'exp_manager.create_checkpoint_callback=True', 'exp_manager.checkpoint_callback_params.monitor=val_loss', 'exp_manager.checkpoint_callback_params.save_top_k=3', 'exp_manager.checkpoint_callback_params.mode=min', 'exp_manager.checkpoint_callback_params.always_save_nemo=False'] Traceback (most recent call last): File "/app/src/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 42, in main trainer.fit(model) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit 
call._call_and_handle_interrupt( File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch return function(*args, **kwargs) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl self._run(model, ckpt_path=ckpt_path) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 973, in _run self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 397, in _restore_modules_and_callbacks self.resume_start(checkpoint_path) File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 79, in resume_start loaded_checkpoint = self.trainer.strategy.load_checkpoint(checkpoint_path) File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 441, in load_checkpoint return self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=checkpoint) File "/opt/NeMo/nemo/utils/callbacks/dist_ckpt_io.py", line 78, in load_checkpoint return dist_checkpointing.load( File "/opt/megatron-lm/megatron/core/dist_checkpointing/serialization.py", line 135, in load loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir) File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 43, in load dict_list_map_inplace(load_fn, sharded_state_dict) File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace x[k] = dict_list_map_inplace(f, v) File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace x[k] = 
dict_list_map_inplace(f, v) File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 184, in dict_list_map_inplace return f(x) File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 80, in _load_from_array x = _load_regular_chunk(sharded_tensor, checkpoint_dir) File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 107, in _load_regular_chunk raise CheckpointingException(_msg) megatron.core.dist_checkpointing.core.CheckpointingException: Global shape mismatch for loaded ((1024, 768)) and expected ((512, 768)) tensor for key model.embedding.position_embeddings.weight
Environment details A100 nemo docker: nvcr.io/nvidia/nemo:24.05.01
I found the cause: I had to delete this folder: nemo_experiments/megatron_gpt/checkpoints. Because exp_manager.resume_if_exists=True, the trainer tried to resume from a checkpoint saved with the previous sequence length (1024), so the stored position-embedding weight of shape (1024, 768) no longer matched the expected (512, 768) after changing model.encoder_seq_length to 512. Deleting the stale checkpoints (or starting a fresh experiment directory) resolves the mismatch.
Describe the bug I followed the instructions in: https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/nemo_megatron/gpt/gpt_training.html
Then I replaced 1024 with 512:
I get this error:
Environment details A100 nemo docker: nvcr.io/nvidia/nemo:24.05.01