artidoro / qlora

QLoRA: Efficient Finetuning of Quantized LLMs
https://arxiv.org/abs/2305.14314
MIT License

Multi-GPU gets error: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:7 #156

Open 2018211801 opened 1 year ago

2018211801 commented 1 year ago

Hello! I would appreciate any advice. My accelerate, peft, deepspeed, and transformers packages are all at the latest versions, and "device_map" is set to "auto".

CUDA_VISIBLE_DEVICES="0,5" torchrun --nnodes=1 --nproc_per_node=2 --master_port='29501' qlora_train.py --learning_rate=2e-5 --per_device_train_batch_size=46 --gradient_accumulation_steps=1 --deepspeed deepspeed_config_s2.json

deepspeed_config_s2.json:

{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },

    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
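For context, the "auto" values above are only resolved when the config is handed to the HF Trainer's DeepSpeed integration, which fills them in from TrainingArguments at startup. A minimal sketch of the equivalent in-code wiring (hypothetical output path; otherwise the same hyperparameters as the launch command above):

from transformers import TrainingArguments

# The Trainer copies these values into the "auto" fields of the DeepSpeed
# config; passing --deepspeed on the command line is equivalent.
training_args = TrainingArguments(
    output_dir="outputs",                     # hypothetical path
    learning_rate=2e-5,
    per_device_train_batch_size=46,
    gradient_accumulation_steps=1,
    fp16=True,
    deepspeed="deepspeed_config_s2.json",
)

qlora_train.py: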
from dataclasses import dataclass, field
from typing import Optional, Tuple
import transformers
from transformers import Trainer
from transformers.models.llama import LlamaForCausalLM  # noqa
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel
)
from accelerate import Accelerator
from transformers import BitsAndBytesConfig
import torch
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
import bitsandbytes as bnb

from nl2sql_dataset import Nl2SqlJsonlDataset
import env
import time
import os

@dataclass
class ModelArguments:
    pretrain_path: str = field(
        default=f"{env.MODEL_ROOT.joinpath('llama-7b')}" # llama-13b-hf
    )
    if_4bit: bool = False

@dataclass
class DataArguments:
    train_file: str = field(
        default=f"{env.INPUT_ROOT.joinpath('trainset/config.json')}",
        metadata={"help": "A josnl file containing the training corpus"},
    )
    validation_file: str = field(
        default=f"{env.INPUT_ROOT.joinpath('devset/config.json')}",
        metadata={"help": "A jsonl file containing the validation corpus"},
    )
    max_seq_length: int = field(
        default=512, metadata={"help": "Max sequence length for training"}
    )
    pad_to_max_length: bool = field(default=False)

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    num_train_epochs: int = 5
    evaluation_strategy: str = field(default="epoch")
    save_strategy: str = "epoch"
    fp16: bool = True
    save_total_limit: int = 5
    load_best_model_at_end: bool = False
    warmup_steps: int = 0
    logging_steps: int = 1
    gradient_checkpointing: bool = True
    ddp_timeout: int = 3600
    output_dir: str = field(
        default=f"{env.OUTPUT_ROOT.joinpath(time.strftime('%Y年%m月%d日%H时%M分%S秒'))}",
    )

def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collects the state dict and dump to disk."""
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa

def parse_args() -> Tuple[ModelArguments, DataArguments, TrainingArguments]:
    parser = transformers.HfArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments)
    )
    return parser.parse_args_into_dataclasses()

def find_all_linear_names(model):
    """
    Find all fully connected (Linear4bit) layers so a LoRA adapter can be added to each of them.
    """
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

class LoRATrainer(Trainer):
    """
    Override the checkpoint-saving logic so that only the LoRA weights are saved.
    """
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        # Save the LoRA weights and config
        self.model.save_pretrained(
            output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
        )

        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, "training_args.bin"))

def train():
    model_args, data_args, training_args = parse_args()
    accelerator = Accelerator()

    if "chatglm" in model_args.pretrain_path:
        print(model_args.pretrain_path)
        model = AutoModel.from_pretrained(model_args.pretrain_path, trust_remote_code=True, empty_init=False)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_args.pretrain_path,
            load_in_4bit=True,
            device_map='auto',
            torch_dtype=torch.bfloat16,
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            ),
        )
    model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
    target_modules = find_all_linear_names(model)
    # Initialize the LoRA config
    config = LoraConfig(
        r=64,
        lora_alpha=16,
        target_modules=target_modules,
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_2_SEQ_LM ",        #"CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    model.print_trainable_parameters()
    #model.config.torch_dtype = torch.float32

    print(model_args, data_args, training_args)
    dataset = Nl2SqlJsonlDataset(
        pretrain_path=model_args.pretrain_path,
        train_file_path=data_args.train_file,
        validation_file_path=data_args.validation_file,
        max_seg_length=data_args.max_seq_length,
        pad_to_max_length=data_args.pad_to_max_length,
    )
    dataset.setup()

    # Tell Trainer not to attempt DataParallel
    model.is_parallelizable = True
    model.model_parallel = True

    # model, train_dataloader, eval_dataloader, optimizer, lr_scheduler = accelerator.prepare(
    #     model, train_dataloader, eval_dataloader, optimizer, lr_scheduler
    # )

    trainer = LoRATrainer(
        model=model,
        args=training_args,
        train_dataset=dataset.train_dataset,
        eval_dataset=dataset.val_dataset,
        data_collator=dataset.collate_fn,
    )
    model.config.use_cache = False

    trainer.train()
    trainer.save_state()
    safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)

if __name__ == "__main__":
    train()
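For reference, a workaround often suggested for this kind of error is to stop using device_map="auto" under torchrun and instead pin the whole quantized model to each process's local GPU. A minimal sketch of that change (my assumption, not a confirmed fix for this repo; hypothetical model path, otherwise the same from_pretrained arguments as above):

import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# torchrun sets LOCAL_RANK for every process; mapping the whole model to that
# single device keeps each replica's tensors on one GPU instead of letting
# device_map="auto" shard layers across all visible devices.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))

model = AutoModelForCausalLM.from_pretrained(
    "llama-7b",                          # hypothetical path to the base model
    device_map={"": local_rank},         # one device per process instead of "auto"
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    ),
)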
yangjianxin1 commented 1 year ago

the same problem

ghtaro commented 1 year ago

@2018211801 do you have any update on the issue? The same error happens to me.

HeegonJin commented 7 months ago

Same issue. I don't think it can handle DeepSpeed multi-GPU parallelism.