NVIDIA / NeMo

A scalable generative AI framework built for researchers and developers working on Large Language Models, Multimodal, and Speech AI (Automatic Speech Recognition and Text-to-Speech)
https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html
Apache License 2.0

Global shape mismatch for loaded ((1024, 768)) and expected ((512, 768)) tensor for key model.embedding.position_embeddings.weight #10715

Open Alireza3242 opened 2 weeks ago

Alireza3242 commented 2 weeks ago

Describe the bug
I followed the instructions in: https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/nlp/nemo_megatron/gpt/gpt_training.html

Then I replaced 1024 with 512 and ran:

python src/examples/nlp/language_modeling/megatron_gpt_pretraining.py  \
    --config-path=conf \
    --config-name=megatron_gpt_config \
    trainer.devices=1 \
    trainer.num_nodes=1 \
    trainer.max_epochs=null \
    trainer.max_steps=30000 \
    trainer.val_check_interval=300 \
    trainer.log_every_n_steps=50 \
    trainer.limit_val_batches=50 \
    trainer.limit_test_batches=50 \
    trainer.accumulate_grad_batches=1 \
    trainer.precision=16 \
    model.micro_batch_size=6 \
    model.global_batch_size=192 \
    model.tensor_model_parallel_size=1 \
    model.pipeline_model_parallel_size=1 \
    model.max_position_embeddings=512 \
    model.encoder_seq_length=512 \
    model.hidden_size=768 \
    model.ffn_hidden_size=3072 \
    model.num_layers=12 \
    model.num_attention_heads=12 \
    model.init_method_std=0.021 \
    model.hidden_dropout=0.1 \
    model.layernorm_epsilon=1e-5 \
    model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json \
    model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt \
    model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document] \
    model.data.num_workers=2 \
    model.data.seq_length=512 \
    model.data.splits_string=\'980,10,10\' \
    model.megatron_amp_O2=False \
    model.optim.name=fused_adam \
    model.optim.lr=6e-4 \
    model.optim.betas=[0.9,0.95] \
    model.optim.weight_decay=0.1 \
    model.optim.sched.name=CosineAnnealing \
    model.optim.sched.warmup_steps=750 \
    model.optim.sched.constant_steps=80000 \
    model.optim.sched.min_lr=6e-5 \
    exp_manager.resume_if_exists=True \
    exp_manager.resume_ignore_no_checkpoint=True \
    exp_manager.create_checkpoint_callback=True \
    exp_manager.checkpoint_callback_params.monitor=val_loss \
    exp_manager.checkpoint_callback_params.save_top_k=3 \
    exp_manager.checkpoint_callback_params.mode=min \
    exp_manager.checkpoint_callback_params.always_save_nemo=False

I get this error:

Error executing job with overrides: ['trainer.devices=1', 'trainer.num_nodes=1', 'trainer.max_epochs=null', 'trainer.max_steps=30000', 'trainer.val_check_interval=300', 'trainer.log_every_n_steps=50', 'trainer.limit_val_batches=50', 'trainer.limit_test_batches=50', 'trainer.accumulate_grad_batches=1', 'trainer.precision=16', 'model.micro_batch_size=6', 'model.global_batch_size=192', 'model.tensor_model_parallel_size=1', 'model.pipeline_model_parallel_size=1', 'model.max_position_embeddings=512', 'model.encoder_seq_length=512', 'model.fp8_amax_history_len=512', 'model.hidden_size=768', 'model.ffn_hidden_size=3072', 'model.num_layers=12', 'model.num_attention_heads=12', 'model.init_method_std=0.021', 'model.hidden_dropout=0.1', 'model.layernorm_epsilon=1e-5', 'model.tokenizer.vocab_file=data/tokenizer/gpt2-vocab.json', 'model.tokenizer.merge_file=data/tokenizer/gpt2-merges.txt', 'model.data.data_prefix=[1.0,data/tokenized_data/hfbpe_gpt_training_data_text_document]', 'model.data.num_workers=2', 'model.data.seq_length=512', "model.data.splits_string='980,10,10'", 'model.megatron_amp_O2=False', 'model.optim.name=fused_adam', 'model.optim.lr=6e-4', 'model.optim.betas=[0.9,0.95]', 'model.optim.weight_decay=0.1', 'model.optim.sched.name=CosineAnnealing', 'model.optim.sched.warmup_steps=750', 'model.optim.sched.constant_steps=80000', 'model.optim.sched.min_lr=6e-5', 'exp_manager.resume_if_exists=True', 'exp_manager.resume_ignore_no_checkpoint=True', 'exp_manager.create_checkpoint_callback=True', 'exp_manager.checkpoint_callback_params.monitor=val_loss', 'exp_manager.checkpoint_callback_params.save_top_k=3', 'exp_manager.checkpoint_callback_params.mode=min', 'exp_manager.checkpoint_callback_params.always_save_nemo=False']
Traceback (most recent call last):
  File "/app/src/examples/nlp/language_modeling/megatron_gpt_pretraining.py", line 42, in main
    trainer.fit(model)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 543, in fit
    call._call_and_handle_interrupt(
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
    return function(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 579, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/trainer.py", line 973, in _run
    self._checkpoint_connector._restore_modules_and_callbacks(ckpt_path)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 397, in _restore_modules_and_callbacks
    self.resume_start(checkpoint_path)
  File "/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/checkpoint_connector.py", line 79, in resume_start
    loaded_checkpoint = self.trainer.strategy.load_checkpoint(checkpoint_path)
  File "/opt/NeMo/nemo/collections/nlp/parts/nlp_overrides.py", line 441, in load_checkpoint
    return self.checkpoint_io.load_checkpoint(checkpoint_path, sharded_state_dict=checkpoint)
  File "/opt/NeMo/nemo/utils/callbacks/dist_ckpt_io.py", line 78, in load_checkpoint
    return dist_checkpointing.load(
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/serialization.py", line 135, in load
    loaded_state_dict = sharded_strategy.load(sharded_state_dict, checkpoint_dir)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 43, in load
    dict_list_map_inplace(load_fn, sharded_state_dict)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace
    x[k] = dict_list_map_inplace(f, v)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 180, in dict_list_map_inplace
    x[k] = dict_list_map_inplace(f, v)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/dict_utils.py", line 184, in dict_list_map_inplace
    return f(x)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 80, in _load_from_array
    x = _load_regular_chunk(sharded_tensor, checkpoint_dir)
  File "/opt/megatron-lm/megatron/core/dist_checkpointing/strategies/tensorstore.py", line 107, in _load_regular_chunk
    raise CheckpointingException(_msg)
megatron.core.dist_checkpointing.core.CheckpointingException: Global shape mismatch for loaded ((1024, 768)) and expected ((512, 768)) tensor for key model.embedding.position_embeddings.weight

Environment details
A100; NeMo Docker image: nvcr.io/nvidia/nemo:24.05.01

Alireza3242 commented 2 weeks ago

I found it: I had to delete this folder: nemo_experiments/megatron_gpt/checkpoints. With exp_manager.resume_if_exists=True, the trainer was resuming from a checkpoint saved by an earlier run that used max_position_embeddings=1024, which no longer matches the new 512-length model.
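
For reference, a minimal cleanup sketch, assuming the default exp_manager output layout implied by the command above (the exact path may differ if exp_dir or the experiment name is customized):

```bash
# Remove the stale checkpoints produced by the earlier 1024-length run,
# so the new 512-length run starts from scratch instead of resuming.
rm -rf nemo_experiments/megatron_gpt/checkpoints

# Alternatively, keep the old checkpoints and just disable auto-resume by
# overriding the flag that triggered the checkpoint load in the first place:
#   exp_manager.resume_if_exists=False
```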