Open 2018211801 opened 1 year ago
Hello! I would appreciate any advice. My accelerate, peft, deepspeed, and transformers packages are all the latest versions, and `device_map` is set to `"auto"`.
```bash
CUDA_VISIBLE_DEVICES="0,5" torchrun --nnodes=1 --nproc_per_node=2 --master_port='29501' qlora_train.py --learning_rate=2e-5 --per_device_train_batch_size=46 --gradient_accumulation_steps=1 --deepspeed deepspeed_config_s2.json
```
deepspeed_config_s2.json:
```json
{
  "fp16": {
    "enabled": "auto",
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": "auto",
      "betas": "auto",
      "eps": "auto",
      "weight_decay": "auto"
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
      "warmup_num_steps": "auto"
    }
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 2e8,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 2e8,
    "contiguous_gradients": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "steps_per_print": 2000,
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
```
qlora_train.py:
```python
from dataclasses import dataclass, field
from typing import Optional, Tuple

import transformers
from transformers import Trainer
from transformers.models.llama import LlamaForCausalLM  # noqa
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
)
from accelerate import Accelerator
from transformers import BitsAndBytesConfig
import torch
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel,
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
import bitsandbytes as bnb
from nl2sql_dataset import Nl2SqlJsonlDataset
import env
import time
import os


@dataclass
class ModelArguments:
    pretrain_path: str = field(
        default=f"{env.MODEL_ROOT.joinpath('llama-7b')}"  # llama-13b-hf
    )
    if_4bit: bool = False


@dataclass
class DataArguments:
    train_file: str = field(
        default=f"{env.INPUT_ROOT.joinpath('trainset/config.json')}",
        metadata={"help": "A jsonl file containing the training corpus"},
    )
    validation_file: str = field(
        default=f"{env.INPUT_ROOT.joinpath('devset/config.json')}",
        metadata={"help": "A jsonl file containing the validation corpus"},
    )
    max_seq_length: int = field(
        default=512, metadata={"help": "Max sequence length for training"}
    )
    pad_to_max_length: bool = field(default=False)


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    num_train_epochs: int = 5
    evaluation_strategy: str = field(default="epoch")
    save_strategy: str = "epoch"
    fp16: bool = True
    save_total_limit: int = 5
    load_best_model_at_end: bool = False
    warmup_steps: int = 0
    logging_steps: int = 1
    gradient_checkpointing: bool = True
    ddp_timeout: int = 3600
    output_dir: str = field(
        default=f"{env.OUTPUT_ROOT.joinpath(time.strftime('%Y年%m月%d日%H时%M分%S秒'))}",
    )


def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collect the state dict and dump it to disk."""
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa


def parse_args() -> Tuple[ModelArguments, DataArguments, TrainingArguments]:
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    return parser.parse_args_into_dataclasses()


def find_all_linear_names(model):
    """Find all linear layers so a LoRA adapter can be attached to each of them."""
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


class LoRATrainer(Trainer):
    """Override the checkpoint-saving logic so that only the LoRA weights are saved."""

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        # Save the LoRA weights and adapter config
        self.model.save_pretrained(
            output_dir,
            state_dict=state_dict,
            safe_serialization=self.args.save_safetensors,
        )
        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))


def train():
    model_args, data_args, training_args = parse_args()
    accelerator = Accelerator()
    if "chatglm" in model_args.pretrain_path:
        print(model_args.pretrain_path)
        model = AutoModel.from_pretrained(
            model_args.pretrain_path, trust_remote_code=True, empty_init=False
        )
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.pretrain_path,
            load_in_4bit=True,
            device_map='auto',
            torch_dtype=torch.bfloat16,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4',
            ),
        )
    model = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=training_args.gradient_checkpointing
    )
    target_modules = find_all_linear_names(model)
    # Initialize the LoRA config
    config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",  # was "SEQ_2_SEQ_LM " (trailing space), which is not a valid PEFT task type for a causal LM
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    # model.config.torch_dtype = torch.float32
    print(model_args, data_args, training_args)
    dataset = Nl2SqlJsonlDataset(
        pretrain_path=model_args.pretrain_path,
        train_file_path=data_args.train_file,
        validation_file_path=data_args.validation_file,
        max_seg_length=data_args.max_seq_length,
        pad_to_max_length=data_args.pad_to_max_length,
    )
    dataset.setup()
    # Tell Trainer not to attempt DataParallel
    model.is_parallelizable = True
    model.model_parallel = True
    # model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    #     model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
    # )
    trainer = LoRATrainer(
        model=model,
        args=training_args,
        train_dataset=dataset.train_dataset,
        eval_dataset=dataset.val_dataset,
        data_collator=dataset.collate_fn,
    )
    model.config.use_cache = False
    trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)


if __name__ == "__main__":
    train()
```
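One thing worth noting in this setup: `device_map="auto"` tells accelerate to shard the model across all GPUs visible to each process, while `torchrun` with DeepSpeed launches one process per GPU and expects each rank to hold its own complete copy of the model. Combining the two is generally not supported. The usual pattern for multi-process QLoRA training is to pin each rank's model to its own device; a minimal sketch of that change, with the model path as a placeholder:

```python
# Sketch: load the 4-bit model onto this process's own GPU instead of
# letting accelerate spread it across all visible devices.
import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

local_rank = int(os.environ.get("LOCAL_RANK", "0"))  # set by torchrun

model = AutoModelForCausalLM.from_pretrained(
    "path/to/llama-7b",            # placeholder path
    device_map={"": local_rank},   # one full replica per rank, not "auto"
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
)
```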
I have the same problem.
@2018211801 do you have any update on the issue? The same error happens to me.
Same issue here. I don't think this setup can handle DeepSpeed multi-GPU parallelism.
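As far as I can tell, at the time of this thread DeepSpeed ZeRO stages 2/3 could not partition or offload bitsandbytes 4-bit quantized weights, so ZeRO + QLoRA failed regardless of the device map (ZeRO-3 + QLoRA support only landed in later peft/transformers/DeepSpeed releases); the common workaround was plain DDP with one replica per rank, as in the sketch above. A quick diagnostic to see the underlying conflict, assuming `model` was loaded with `device_map="auto"` as in the original script:

```python
# Sketch: report which devices this rank's parameters landed on. With
# device_map="auto" and two visible GPUs, every torchrun process will show
# parameters split across both cuda:0 and cuda:1, which is the layout that
# DDP/DeepSpeed cannot work with.
import os
from collections import Counter
import torch.nn as nn

def report_param_devices(model: nn.Module) -> None:
    devices = Counter(str(p.device) for p in model.parameters())
    print(f"rank {os.environ.get('RANK', '?')}: {dict(devices)}")
```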