zyushun / Adam-mini

Code for Adam-mini: Use Fewer Learning Rates To Gain More https://arxiv.org/abs/2406.16793

GPU memory overflows after integrating into the transformers Trainer; with AdamW it does not #15

Open Panda-eat-meat opened 1 month ago

Panda-eat-meat commented 1 month ago

Hello, after I integrated Adam-mini into the Trainer, training with DeepSpeed runs out of GPU memory. The loading code is as follows:

class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    r"""
    Inherits Seq2SeqTrainer to compute generative metrics such as BLEU and ROUGE.
    """

    def __init__(self, finetuning_args: "FinetuningArguments", **kwargs) -> None:
        super().__init__(**kwargs)
        self.finetuning_args = finetuning_args

    def create_optimizer(self) -> "torch.optim.Optimizer":
        if self.args.optim == "adam_mini":
            self.optimizer = Adam_mini(
                model=self.model,
                lr=self.args.learning_rate,
                betas=(self.args.adam_beta1, self.args.adam_beta2),
                weight_decay=self.args.weight_decay,
                model_sharding=True,
                n_feature=self.model.config.hidden_size,
                n_head=self.model.config.num_attention_heads,
            )
        else:
            if self.optimizer is None:
                self.optimizer = create_custom_optimzer(self.model, self.args, self.finetuning_args)
        optimizer = super().create_optimizer()
        return optimizer
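
For reference, a minimal sketch of a memory-logging callback that could make the AdamW vs. Adam-mini comparison concrete (the callback name is hypothetical and not part of the original report; it only uses standard transformers/torch APIs):

import torch
from transformers import TrainerCallback

class PeakMemoryCallback(TrainerCallback):
    """Hypothetical helper: log peak CUDA memory after each optimizer step."""

    def on_step_end(self, args, state, control, **kwargs):
        if torch.cuda.is_available() and state.is_local_process_zero:
            peak_gib = torch.cuda.max_memory_allocated() / 1024 ** 3
            print(f"step {state.global_step}: peak allocated {peak_gib:.2f} GiB")
            # Reset so the next step reports its own peak.
            torch.cuda.reset_peak_memory_stats()

# Usage (hypothetical): trainer.add_callback(PeakMemoryCallback()) before trainer.train()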

The training script is as follows:

WANDB_DISABLED=True nohup deepspeed --master_port 22345 --include localhost:$gpus --no_local_rank src/train_bash.py \
    --stage sft \
    --deepspeed ds_config_2.json \
    --model_name_or_path $MODEL_NAME_OR_PATH \
    --use_fast_tokenizer true \
    --do_train \
    --optim adam_mini \
    --finetuning_type full \
    --dataset stage_dpo_sft \
    --output_dir $OUTPUT_DIR \
    --per_device_train_batch_size 1 \
    --cutoff_len 8192 \
    --gradient_accumulation_steps 8 \
    --preprocessing_num_workers 16 \
    --lr_scheduler_type cosine \
    --evaluation_strategy "no" \
    --save_strategy "epoch" \
    --logging_steps 10 \
    --save_steps 1000 \
    --learning_rate 4e-6 \
    --warmup_ratio 0.1 \
    --num_train_epochs 3.0 \
    --use_fast_tokenizer true \
    --plot_loss \
    --gradient_checkpointing \
    --cache_dir $CACHE_DIR \
    --template qwen \
    --flash_attn \
    --bf16 > $LOG_PATH  &

The ds_config is as follows:

{
    "bf16": {
       "enabled": true
     },
    "zero_allow_untested_optimizer": true,
    "zero_force_ds_cpu_optimizer": false,
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 200000000,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 200000000,
        "contiguous_gradients": true,
                "stage3_gather_16bit_weights_on_model_save": true
    },
    "flops_profiler": {
        "enabled": true,
        "profile_step": 1,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": true,
        "output_file": null
    },
    "comms_logger": {
        "enabled": true,
        "verbose": false,
        "prof_all": true,
        "debug": false
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 10,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": true
}

The error message is as follows:

Traceback (most recent call last):
  File "src/train_bash.py", line 14, in <module>
    main()
  File "src/train_bash.py", line 5, in main
    run_exp()
  File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/train/tuner.py", line 32, in run_exp
    run_sft(model_args, data_args, training_args, finetuning_args, generating_args, callbacks)
  File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/train/sft/workflow.py", line 73, in run_sft
    train_result = trainer.train(resume_from_checkpoint=training_args.resume_from_checkpoint)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 1780, in train
    return inner_training_loop(
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 2118, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/transformers/trainer.py", line 3045, in training_step
    self.accelerator.backward(loss)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/accelerate/accelerator.py", line 2121, in backward
    self.deepspeed_engine_wrapped.backward(loss, **kwargs)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/accelerate/utils/deepspeed.py", line 175, in backward
    self.engine.step()
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2181, in step
    self._take_model_step(lr_kwargs)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 2087, in _take_model_step
    self.optimizer.step()
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1920, in step
    self._optimizer_step(i)
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 1827, in _optimizer_step
    self.optimizer.step()
  File "/data/qiucehao/project/venv38/lib/python3.8/site-packages/torch/optim/optimizer.py", line 373, in wrapper
    out = func(*args, **kwargs)
  File "/data/qiucehao/project/code/huarong_code/adam-mini-code/LLaMA-Factory/src/llmtuner/Adam_mini/Adam_mini_transformer.py", line 200, in step
    h = (state["v"].sqrt() / bias_correction_2_sqrt).add_(eps)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 8.80 GiB. GPU 2 has a total capacty of 79.33 GiB of which 423.81 MiB is free. Including non-PyTorch memory, this process has 78.90 GiB memory in use. Of the allocated memory 75.56 GiB is allocated by PyTorch, and 1.72 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
  0%|          | 0/2157 [00:24<?, ?it/s]
chcoliang commented 1 month ago

@Panda-eat-meat Hi. From Adam-mini's output alone we cannot determine the exact cause. It may be that the AdamW you are calling uses other memory-saving techniques or CPU offload. Could you share the full configurations for both the AdamW and the Adam-mini runs, in particular the complete DeepSpeed configuration (including default values not specified in the config file)?
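
One way to collect what is being asked for is a callback that dumps the resolved DeepSpeed config and the optimizer class once training starts, run once with AdamW and once with Adam-mini. This is a minimal sketch; the callback name is illustrative, and the hf_deepspeed_config attribute path is an assumption based on recent transformers versions:

import json
from transformers import TrainerCallback

class DumpDeepSpeedConfig(TrainerCallback):
    """Hypothetical helper: print the resolved DeepSpeed config and optimizer class."""

    def on_train_begin(self, args, state, control, optimizer=None, **kwargs):
        if not state.is_world_process_zero:
            return
        # After DeepSpeed initialization, the "auto" entries in ds_config are resolved.
        hf_ds_config = getattr(args, "hf_deepspeed_config", None)  # assumed attribute
        if hf_ds_config is not None:
            print(json.dumps(hf_ds_config.config, indent=2, default=str))
        if optimizer is not None:
            print("wrapped optimizer:", type(optimizer))

# Usage (hypothetical): trainer.add_callback(DumpDeepSpeedConfig()) before trainer.train()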