huggingface / nanotron

Minimalistic large language model 3D-parallelism training
Apache License 2.0

[Refactor] Add ParallelContext to nanotron #11

Closed · xrsrke closed this 8 months ago

xrsrke commented 8 months ago

I trained a tiny Llama for 1500 steps on this PR's branch and on a reference repo (before the refactor), then compared the losses: they are identical down to the last decimal place. The branch also passes and fails exactly the same tests as it did before the refactoring [link].

[image: loss curves from this PR's run vs. the reference run]
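For reference, a minimal sketch of the kind of comparison used here, assuming the per-step losses from both runs have been dumped to plain-text files; the file names and format are illustrative and not something nanotron produces by itself:

```python
# Compare per-step losses from two runs and report the first divergence, if any.
# Assumes each file holds one loss value per line (file names/format are illustrative).

def load_losses(path: str) -> list[float]:
    with open(path) as f:
        return [float(line) for line in f if line.strip()]

losses_pr = load_losses("losses_parallel_context_pr.txt")   # hypothetical dump from this PR's run
losses_ref = load_losses("losses_reference_repo.txt")       # hypothetical dump from the reference run

assert len(losses_pr) == len(losses_ref), "runs logged a different number of steps"

for step, (a, b) in enumerate(zip(losses_pr, losses_ref), start=1):
    if a != b:  # exact equality, i.e. identical down to the last decimal place
        raise SystemExit(f"first divergence at step {step}: {a} vs {b}")

print(f"all {len(losses_pr)} logged losses match exactly")
```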

Config of the trained model

Command: FI_PROVIDER=efa USE_FAST=1 CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --rdzv-backend=c10d --nproc_per_node=4 run_train.py --config-file examples/config_phuc_tiny_llama.yaml
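A small aside on the environment variables in that command: CUDA_DEVICE_MAX_CONNECTIONS=1 is the setting usually needed when tensor-parallel linears overlap communication with compute (see tp_linear_async_communication: true in the config below), so a pre-flight check along these lines can catch a missing variable before the workers spawn. The check is only a sketch, not part of nanotron:

```python
# Illustrative pre-flight check for the launch environment (not part of nanotron).
import os

# With async tensor-parallel communication (tp_linear_async_communication: true),
# CUDA_DEVICE_MAX_CONNECTIONS=1 is typically required so kernels are issued in order
# and the communication can actually overlap with the matmuls.
assert os.environ.get("CUDA_DEVICE_MAX_CONNECTIONS") == "1", (
    "set CUDA_DEVICE_MAX_CONNECTIONS=1 before launching with tp_linear_async_communication"
)
```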

checkpoints:
  checkpoint_interval: 10
  checkpoints_path: /fsx/phuc/checkpoints/nanotron
  checkpoints_path_is_shared_file_system: true

data:
  dataset:
    dataset_overwrite_cache: false
    dataset_processing_num_proc_per_process: 1
    hf_dataset_config_name: null
    hf_dataset_or_datasets: HuggingFaceH4/scale_prompts_098236
    # hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
    # hf_dataset_or_datasets: HuggingFaceH4/lima_llama2
    hf_dataset_splits: train
    text_column_name: prompt
  num_loading_workers: 1
  seed: 42
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: false
  kill_switch_path: null
  project: debug
  run: tiny_llama
  seed: 42
  step: null
logging:
  iteration_step_info_interval: 1
  log_level: info
  log_level_replica: info
model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  model_config:
    bos_token_id: 1
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 16
    initializer_range: 0.02
    intermediate_size: 64
    is_llama_config: true
    max_position_embeddings: 256
    num_attention_heads: 4
    num_hidden_layers: 6
    num_key_value_heads: 4
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_scaling: null
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 50272
optimizer:
  accumulate_grad_in_fp32: true
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_eps: 1.0e-08
  clip_grad: 1.0
  learning_rate_scheduler:
    learning_rate: 0.0003
    lr_decay_steps: 8
    lr_decay_style: cosine
    lr_warmup_steps: 2
    lr_warmup_style: linear
    min_decay_lr: 1.0e-05
  torch_adam_is_fused: true
  weight_decay: 0.01
  zero_stage: 0
parallelism:
  dp: 2
  pp: 1
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 2
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER
profiler: null
tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: gpt2
  tokenizer_revision: null
tokens:
  batch_accumulation_per_replica: 1
  limit_test_batches: 0
  limit_val_batches: 0
  micro_batch_size: 2
  sequence_length: 32
  # train_steps: 1000
  train_steps: 1579
  val_check_interval: -1
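As a quick sanity check on the parallelism block, the degrees have to multiply out to the launched world size: dp × pp × tp = 2 × 1 × 2 = 4, which matches --nproc_per_node=4 in the command above. A minimal sketch of that check follows, plus a hypothetical ParallelContext construction whose argument names are illustrative rather than the exact signature added in this PR:

```python
# Check that the parallelism degrees in the YAML config cover the launched world size.
dp, pp, tp = 2, 1, 2        # parallelism.dp / .pp / .tp from the config above
nproc_per_node = 4          # from the torchrun command

assert dp * pp * tp == nproc_per_node, (
    f"dp*pp*tp = {dp * pp * tp} must equal the world size {nproc_per_node}"
)

# Hypothetical usage of the ParallelContext introduced by this PR; argument names
# are illustrative and may not match the real signature.
# parallel_context = ParallelContext(
#     data_parallel_size=dp,
#     pipeline_parallel_size=pp,
#     tensor_parallel_size=tp,
# )
```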