hiyouga / LLaMA-Factory

Unified Efficient Fine-Tuning of 100+ LLMs (ACL 2024)
https://arxiv.org/abs/2403.13372
Apache License 2.0
33.04k stars 4.06k forks

CUDA OOM when DeepSpeed loads ZeRO-2 for full-parameter DPO training of a fine-tuned qwen-14b-chat model #2419

Closed LeonG7 closed 8 months ago

LeonG7 commented 8 months ago

Reminder

Reproduction

ds_config_zero2.json

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
    --deepspeed ds_config_zero2.json \
    --output_dir output_path \
    --model_name_or_path qwen-14b-sft \
    --lr_scheduler_type cosine \
    --warmup_ratio 0.1 \
    --stage dpo \
    --do_train \
    --template qwen \
    --finetuning_type full \
    --overwrite_output_dir \
    --overwrite_cache \
    --per_device_train_batch_size 4 \
    --gradient_accumulation_steps 32 \
    --adam_beta1 0.9 \
    --adam_beta2 0.95 \
    --weight_decay 0.1 \
    --logging_steps 1 \
    --save_strategy "epoch" \
    --save_only_model \
    --learning_rate 2e-6 \
    --num_train_epochs 1 \
    --dataloader_num_workers 16 \
    --plot_loss \
    --bf16 \
    --log_level info \
    --dataset comparison_gpt_zh
After running, the log shows:
    [2024-02-03 10:36:50,844] [INFO] [config.py:950:print_user_config]   json = {
    "train_batch_size": 1.024000e+03, 
    "train_micro_batch_size_per_gpu": 4, 
    "gradient_accumulation_steps": 32, 
    "gradient_clipping": 1.0, 
    "zero_allow_untested_optimizer": true, 
    "fp16": {
        "enabled": false, 
        "loss_scale": 0, 
        "initial_scale_power": 16, 
        "loss_scale_window": 1000, 
        "hysteresis": 2, 
        "min_loss_scale": 1
    }, 
    "zero_optimization": {
        "stage": 0, 
        "allgather_partitions": true, 
        "allgather_bucket_size": 2.000000e+08, 
        "reduce_scatter": true, 
        "reduce_bucket_size": 2.000000e+08, 
        "overlap_comm": true, 
        "contiguous_gradients": true
    }, 
    "steps_per_print": inf, 
    "bf16": {
        "enabled": true
    }
}

Traceback (most recent call last):
  File "src/train_bash.py", line 14, in <module>
    main()
  File "src/train_bash.py", line 5, in main
    run_exp()
  File "/mnt/volumes/dip-ulan-cpfs/NLP/train_code/LLaMA-Factory/src/llmtuner/train/tuner.py", line 37, in run_exp
    run_dpo(model_args, data_args, training_args, finetuning_args, callbacks)
  File "/mnt/volumes/dip-ulan-cpfs/NLP/train_code/LLaMA-Factory/src/llmtuner/train/dpo/workflow.py", line 65, in run_dpo
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/home/jovyan/.local/lib/python3.8/site-packages/transformers/trainer.py", line 1537, in train
    return inner_training_loop(
  File "/home/jovyan/.local/lib/python3.8/site-packages/transformers/trainer.py", line 1672, in _inner_training_loop
    model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer)
  File "/home/jovyan/.local/lib/python3.8/site-packages/accelerate/accelerator.py", line 1284, in prepare
    result = self._prepare_deepspeed(*args)
  File "/home/jovyan/.local/lib/python3.8/site-packages/accelerate/accelerator.py", line 1666, in _prepare_deepspeed
    engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/__init__.py", line 165, in initialize
    engine = DeepSpeedEngine(args=args,
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 309, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1185, in _configure_optimizer
    self.optimizer = self._configure_zero_optimizer(basic_optimizer)
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1420, in _configure_zero_optimizer
    optimizer = DeepSpeedZeroOptimizer(
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 485, in __init__
    self.initialize_optimizer_states()
  File "/home/jovyan/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 620, in initialize_optimizer_states
    self.optimizer.step()
  File "/home/jovyan/.local/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/jovyan/.local/lib/python3.8/site-packages/torch/optim/optimizer.py", line 385, in wrapper
    out = func(*args, **kwargs)
  File "/home/jovyan/.local/lib/python3.8/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
    ret = func(self, *args, **kwargs)
  File "/home/jovyan/.local/lib/python3.8/site-packages/torch/optim/adamw.py", line 176, in step
    has_complex = self._init_group(
  File "/home/jovyan/.local/lib/python3.8/site-packages/torch/optim/adamw.py", line 127, in _init_group
    state["exp_avg_sq"] = torch.zeros_like(
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.60 GiB. GPU 0 has a total capacity of 79.35 GiB of which 5.00 GiB is free. Process 1846975 has 74.34 GiB memory in use. Of the allocated memory 72.73 GiB is allocated by PyTorch, and 1.41 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Expected behavior

No response

System Info

transformers 4.36.2
transformers-stream-generator 0.0.4
pytorch-quantization 2.1.2
torch 2.2.0
torch-tensorrt 1.4.0.dev0
torchdata 0.7.1
torchtext 0.17.0
torchvision 0.17.0
python 3.8.16 (also tried Python 3.10; the error occurs on both)

Others

No response

Amazing-J commented 8 months ago

+1

hiyouga commented 8 months ago

Try ZeRO-3.
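
For reference, a ZeRO-3 config in the same style as the ds_config_zero2.json above might look like the sketch below; the stage3_* values and the two offload blocks follow the usual HuggingFace/DeepSpeed example defaults and are not settings confirmed in this thread. The fp16 and scheduler blocks from the ZeRO-2 file can be carried over unchanged, and setting the offload devices to "cpu" trades speed for extra memory headroom.

{
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto",
        "stage3_prefetch_bucket_size": "auto",
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
        "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}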

LeonG7 commented 8 months ago

> Try ZeRO-3.

ZeRO-3 does not throw the error, but training just hangs and makes no progress.

endxxxx commented 8 months ago

I'm running DPO LoRA training of qwen-7b-chat on 8×A800 80G and it fills the GPU memory completely, while RLHF LoRA only uses about half. Very strange (both runs use ZeRO-2).

LeonG7 commented 8 months ago

I was running the 14B model before; it probably just does not fit. After switching to the 7B model, full-parameter DPO runs fine.
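
A rough estimate of why the 14B model does not fit under ZeRO-2 here (the printed train_batch_size of 1024 = 4 × 32 × 8 points to 8 GPUs): each rank holds a bf16 policy model (about 28 GB at 14B parameters) plus a frozen reference model copy for DPO (roughly the same again), and ZeRO-2 adds that rank's shard of the fp32 master weights and Adam moments, roughly 14B × 12 bytes / 8 ≈ 21 GB; the failed 6.60 GiB allocation in the traceback is about the size of a single fp32 Adam moment shard (14B × 4 bytes / 8). That already approaches 80 GB before gradients and activations. If 14B full-parameter DPO is still the goal, one common workaround (not verified in this thread) is to keep ZeRO-2 and move the optimizer states to CPU memory, i.e. the same ds_config_zero2.json with only the offload device changed:

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    }

This needs enough host RAM for the full fp32 optimizer state (on the order of 170 GB per node) and makes each optimizer step slower, but it moves exactly the allocation that fails in the traceback off the GPU.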

JerryDaHeLian commented 7 months ago

Mine is TinyLlama 1.2B, trained with DPO LoRA on a single A100-80G. I ran SFT and then DPO training basically following the README, using the data provided there, but nvidia-smi shows no GPU memory being used. What is going on? The script is:

CUDA_VISIBLE_DEVICES=7 python3 src/train_bash.py \
    --stage dpo \
    --do_train \
    --model_name_or_path tinyllama_base \
    --adapter_name_or_path ./output/tinyllama_sft/checkpoint-1500 \
    --create_new_adapter \
    --dataset comparison_gpt4_zh \
    --template llama2_zh \
    --finetuning_type lora \
    --lora_target q_proj,v_proj \
    --output_dir ./output/tinyllama_dpo \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 500 \
    --learning_rate 1e-5 \
    --num_train_epochs 1.0 \
    --plot_loss \
    --overwrite_output_dir true \
    --bf16