Add LoRA support for Llama models for sequence classification task

@BenjaminBossan Sorry for the delayed response.

Here's the code used for training. The code is taken from transformers examples and modified to add lora layers on top of base model.

Command used to run python -m torch.distributed.launch --nproc_per_node 1 examples/pytorch/text-classification/run_glue.py --do_train --do_eval --per_device_train_batch_size 1 --per_device_eval_batch_size 1 --evaluation_strategy epoch --num_train_epochs 3 --save_strategy epoch --fp16 true --deepspeed tests/deepspeed/ds_config_zero3.json --metric_for_best_model loss --output_dir outputs --eval_accumulation_steps 1

import logging
import os
import json

os.system("pip install --upgrade pytest")

import random
import sys
import warnings
from dataclasses import dataclass, field
from typing import Optional

os.system("pip install --upgrade datasets")
import datasets
import evaluate
import numpy as np
from datasets import load_dataset

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

os.system("pip install --upgrade peft")
from peft import get_peft_model, LoraConfig, TaskType

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

logger = logging.getLogger(__name__)

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})

    def __post_init__(self):
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
        elif self.dataset_name is not None:
            pass
        elif self.train_file is None or self.validation_file is None:
            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json", "jsonl"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), "`validation_file` should have the same extension (csv or json) as `train_file`."

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    token: str = field(
        default=None,
        metadata={
            "help": (
                "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
                "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
            )
        },
    )
    use_auth_token: bool = field(
        default=None,
        metadata={
            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
        },
    )
    trust_remote_code: bool = field(
        default=False,
        metadata={
            "help": (
                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
                "should only be set to `True` for repositories you trust and in which you have read the code, as it will"
                "execute code present on the Hub on your local machine."
            )
        },
    )
    ignore_mismatched_sizes: bool = field(
        default=False,
        metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
    )
    lora_r: int = field(
        default=8,
        metadata={
            "help": "lora r"
        }
    )
    lora_alpha: int = field(
        default=128,
        metadata={
            "help": "lora alpha"
        }
    )
    lora_dropout: float = field(
        default=0.0,
        metadata={
            "help": "lora_dropout"
        }
    )
    fan_in_fan_out: bool = field(
        default=False,
        metadata={
            "help": "fan_in_fan_out"
        }
    )

def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # update the model name or path
    model_selector_args_path = os.path.join(model_args.model_name_or_path, "model_selector_args.json")
    with open(model_selector_args_path, "r") as rptr:
        model_selector_args = json.load(rptr)
        model_name = model_selector_args.get("model_name")
    model_args.model_name_or_path = os.path.join(model_args.model_name_or_path, model_name)

    if model_args.use_auth_token is not None:
        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_glue", model_args, data_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    if training_args.should_log:
        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
        transformers.utils.logging.set_verbosity_info()

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            "glue",
            data_args.task_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
        )
    elif data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
        )
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {"train": data_args.train_file, "validation": data_args.validation_file}

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError("Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            raw_datasets = load_dataset(
                "csv",
                data_files=data_files,
                cache_dir=model_args.cache_dir,
                token=model_args.token,
            )
        else:
            # Loading a dataset from local json files
            raw_datasets = load_dataset(
                "json",
                data_files=data_files,
                cache_dir=model_args.cache_dir,
                token=model_args.token,
            )
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        token=model_args.token,
        trust_remote_code=model_args.trust_remote_code,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        token=model_args.token,
        trust_remote_code=model_args.trust_remote_code,
    )
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        token=model_args.token,
        trust_remote_code=model_args.trust_remote_code,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
    )
    # apply peft
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=model_args.lora_r,
        lora_alpha=model_args.lora_alpha,
        lora_dropout=model_args.lora_dropout,
        fan_in_fan_out=model_args.fan_in_fan_out
        # modules_to_save=new_initialized_modules,
        # target_modules=target_modules
    )
    model = get_peft_model(model, lora_config)

    # Preprocessing the raw_datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (
        model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
        and data_args.task_name is not None
        and not is_regression
    ):
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
        if sorted(label_name_to_id.keys()) == sorted(label_list):
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
                f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
                "\nIgnoring the model labels as a result.",
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        model.config.label2id = {l: i for i, l in enumerate(label_list)}
        model.config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    with training_args.main_process_first(desc="dataset map pre-processing"):
        raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))

    # Log a few random samples from the training set:
    if training_args.do_train:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = evaluate.load("glue", data_args.task_name)
    elif is_regression:
        metric = evaluate.load("mse")
    else:
        metric = evaluate.load("accuracy")

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        result = metric.compute(predictions=preds, references=p.label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result

    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
    # we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            valid_mm_dataset = raw_datasets["validation_mismatched"]
            if data_args.max_eval_samples is not None:
                max_eval_samples = min(len(valid_mm_dataset), data_args.max_eval_samples)
                valid_mm_dataset = valid_mm_dataset.select(range(max_eval_samples))
            eval_datasets.append(valid_mm_dataset)
            combined = {}

        for eval_dataset, task in zip(eval_datasets, tasks):
            metrics = trainer.evaluate(eval_dataset=eval_dataset)

            max_eval_samples = (
                data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
            )
            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

            if task == "mnli-mm":
                metrics = {k + "_mm": v for k, v in metrics.items()}
            if task is not None and "mnli" in task:
                combined.update(metrics)

            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        predict_datasets = [predict_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            predict_datasets.append(raw_datasets["test_mismatched"])

        for predict_dataset, task in zip(predict_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            predict_dataset = predict_dataset.remove_columns("label")
            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

            output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_predict_file, "w") as writer:
                    logger.info(f"***** Predict results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")

    kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-classification"}
    if data_args.task_name is not None:
        kwargs["language"] = "en"
        kwargs["dataset_tags"] = "glue"
        kwargs["dataset_args"] = data_args.task_name
        kwargs["dataset"] = f"GLUE {data_args.task_name.upper()}"

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()

if __name__ == "__main__":
    main()

Traceback

[2023-09-19 09:54:57,086] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-09-19 09:54:57,086] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-09-19 09:54:57,086] [INFO] [comm.py:625:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
09/19/2023 09:54:57 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True
09/19/2023 09:54:57 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=tests/deepspeed/ds_config_zero3.json,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=1,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=5e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=outputs/runs/Sep19_09-54-57_f5adbc1a15d5430d990ae2dd9a192ddb000001,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=500,
logging_strategy=steps,
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=loss,
mp_parameters=,
no_cuda=False,
num_train_epochs=3.0,
optim=adamw_hf,
optim_args=None,
output_dir=outputs,
overwrite_output_dir=False,
past_index=-1,
per_device_eval_batch_size=1,
per_device_train_batch_size=1,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=['mlflow'],
resume_from_checkpoint=None,
run_name=outputs,
save_on_each_node=False,
save_safetensors=False,
save_steps=500,
save_strategy=epoch,
save_total_limit=None,
seed=42,
sharded_ddp=[],
skip_memory_metrics=True,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
warmup_ratio=0.0,
warmup_steps=0,
weight_decay=0.0,
xpu_backend=None,
)
09/19/2023 09:54:57 - INFO - __main__ - load a local file for train: /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_train_file_path/validation.jsonl
09/19/2023 09:54:57 - INFO - __main__ - load a local file for validation: /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_validation_file_path/validation.jsonl
Using custom data configuration default-35e994c99d798607
09/19/2023 09:54:58 - INFO - datasets.builder - Using custom data configuration default-35e994c99d798607
Loading Dataset Infos from /opt/conda/envs/ptca/lib/python3.8/site-packages/datasets/packaged_modules/json
09/19/2023 09:54:58 - INFO - datasets.info - Loading Dataset Infos from /opt/conda/envs/ptca/lib/python3.8/site-packages/datasets/packaged_modules/json
Generating dataset json (/root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
09/19/2023 09:54:58 - INFO - datasets.builder - Generating dataset json (/root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...
09/19/2023 09:54:58 - INFO - datasets.builder - Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]
Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 11586.48it/s]
Downloading took 0.0 min
09/19/2023 09:54:58 - INFO - datasets.download.download_manager - Downloading took 0.0 min
Checksum Computation took 0.0 min
09/19/2023 09:54:58 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 69.66it/s]
Generating train split
09/19/2023 09:54:58 - INFO - datasets.builder - Generating train split

Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 2000 examples [00:00, 207685.08 examples/s]
Generating validation split
09/19/2023 09:54:58 - INFO - datasets.builder - Generating validation split

Generating validation split: 0 examples [00:00, ? examples/s]
Generating validation split: 2000 examples [00:00, 447631.16 examples/s]
Unable to verify splits sizes.
09/19/2023 09:54:58 - INFO - datasets.utils.info_utils - Unable to verify splits sizes.
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.
09/19/2023 09:54:58 - INFO - datasets.builder - Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.
[INFO|configuration_utils.py:710] 2023-09-19 09:54:58,125 >> loading configuration file /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_model_selector_output/default_model_name/config.json
[INFO|configuration_utils.py:768] 2023-09-19 09:54:58,126 >> Model config LlamaConfig {
  "_name_or_path": "/mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_model_selector_output/default_model_name",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 32000
}

[INFO|tokenization_utils_base.py:1837] 2023-09-19 09:54:58,156 >> loading file tokenizer.model
[INFO|tokenization_utils_base.py:1837] 2023-09-19 09:54:58,157 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:1837] 2023-09-19 09:54:58,157 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:1837] 2023-09-19 09:54:58,157 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:1837] 2023-09-19 09:54:58,157 >> loading file tokenizer_config.json
[INFO|modeling_utils.py:2600] 2023-09-19 09:54:58,348 >> loading weights file /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_model_selector_output/default_model_name/model.safetensors.index.json
[INFO|modeling_utils.py:2694] 2023-09-19 09:54:58,368 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:50 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:50 [0] NCCL INFO Bootstrap : Using eth0:10.0.0.5<0>
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:50 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol.
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:50 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5).
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:50 [0] NCCL INFO cudaDriverVersion 12020
NCCL version 2.17.1+cuda11.7
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Plugin Path : /opt/nccl-rdma-sharp-plugins/lib/libnccl-net.so
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO P2P plugin IBext
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO NET/IB : Using [0]mlx5_ib0:1/IB [RO]; OOB eth0:10.0.0.5<0>
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Using network IBext
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 00/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 01/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 02/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 03/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 04/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 05/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 06/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 07/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 08/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 09/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 10/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 11/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 12/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 13/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 14/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 15/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 16/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 17/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 18/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 19/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 20/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 21/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 22/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 23/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 24/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 25/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 26/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 27/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 28/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 29/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 30/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Channel 31/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO P2P Chunksize set to 131072
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Connected all rings
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO Connected all trees
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO MSCCL: No external scheduler found, using internal implementation
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO MSCCL: Initialization finished
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:178 [0] NCCL INFO comm 0x567f3f30 rank 0 nranks 1 cudaDev 0 busId 100000 commId 0x4ee5a1a2f8b48412 - Init COMPLETE
[2023-09-19 09:55:08,440] [INFO] [partition_parameters.py:453:__exit__] finished initializing model with 6.61B parameters

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]
Loading checkpoint shards:  50%|█████     | 1/2 [00:18<00:18, 18.39s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 11.21s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:24<00:00, 12.29s/it]
[INFO|modeling_utils.py:3319] 2023-09-19 09:55:33,068 >> Some weights of the model checkpoint at /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_model_selector_output/default_model_name were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[INFO|modeling_utils.py:3337] 2023-09-19 09:55:33,068 >> All the weights of LlamaForSequenceClassification were initialized from the model checkpoint at /mnt/azureml/cr/j/edcc6a3391544d68a3d12c4dbe6d3944/cap/data-capability/wd/INPUT_model_selector_output/default_model_name.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForSequenceClassification for predictions without further training.

Running tokenizer on dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]Caching processed dataset at /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-0afc80cb8fa30c8e.arrow
09/19/2023 09:55:45 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-0afc80cb8fa30c8e.arrow

Running tokenizer on dataset: 100%|██████████| 2000/2000 [00:00<00:00, 17869.01 examples/s]
Running tokenizer on dataset: 100%|██████████| 2000/2000 [00:00<00:00, 17511.58 examples/s]

Running tokenizer on dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]Caching processed dataset at /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2da2f6cc0f60af31.arrow
09/19/2023 09:55:45 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/json/default-35e994c99d798607/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-2da2f6cc0f60af31.arrow

Running tokenizer on dataset: 100%|██████████| 2000/2000 [00:00<00:00, 23065.21 examples/s]09/19/2023 09:55:45 - INFO - __main__ - Sample 1309 of the training set: {'text': 'i mean obviously yes i did a hour round trip to perform for minutes and had a seriously dodgy chinese meal which has left me feeling decidedly delicate but overall i really enjoyed myself', 'label': 2, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 474, 2099, 12879, 4874, 474, 1258, 263, 7234, 4513, 17487, 304, 2189, 363, 6233, 322, 750, 263, 25798, 21130, 1927, 521, 8233, 592, 284, 607, 756, 2175, 592, 11223, 8459, 368, 628, 9593, 541, 12463, 474, 2289, 27849, 6142], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.
09/19/2023 09:55:45 - INFO - __main__ - Sample 228 of the training set: {'text': 'i began feeling shaky my heart was sort of skipping around i felt like someone who had been drinking coffee all day long', 'label': 4, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 474, 4689, 11223, 528, 557, 29891, 590, 5192, 471, 2656, 310, 14993, 3262, 2820, 474, 7091, 763, 4856, 1058, 750, 1063, 13748, 292, 26935, 599, 2462, 1472], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.
09/19/2023 09:55:45 - INFO - __main__ - Sample 51 of the training set: {'text': 'i have a feeling hes going to be way more successful than i am', 'label': 1, 'input_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 474, 505, 263, 11223, 19066, 2675, 304, 367, 982, 901, 9150, 1135, 474, 626], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}.

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]
Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<00:00, 23.8MB/s]
[INFO|trainer.py:763] 2023-09-19 09:55:45,821 >> The following columns in the training set don't have a corresponding argument in `PeftModelForSequenceClassification.forward` and have been ignored: text. If text are not expected by `PeftModelForSequenceClassification.forward`,  you can safely ignore this message.
[2023-09-19 09:55:45,828] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.9.5, git-hash=unknown, git-branch=unknown
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Using network IBext
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Setting affinity for GPU 0 to 0fffff
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 00/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 01/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 02/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 03/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 04/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 05/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 06/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 07/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 08/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 09/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 10/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 11/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 12/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 13/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 14/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 15/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 16/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 17/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 18/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 19/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 20/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 21/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 22/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 23/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 24/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 25/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 26/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 27/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 28/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 29/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 30/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Channel 31/32 :    0
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Trees [0] -1/-1/-1->0->-1 [1] -1/-1/-1->0->-1 [2] -1/-1/-1->0->-1 [3] -1/-1/-1->0->-1 [4] -1/-1/-1->0->-1 [5] -1/-1/-1->0->-1 [6] -1/-1/-1->0->-1 [7] -1/-1/-1->0->-1 [8] -1/-1/-1->0->-1 [9] -1/-1/-1->0->-1 [10] -1/-1/-1->0->-1 [11] -1/-1/-1->0->-1 [12] -1/-1/-1->0->-1 [13] -1/-1/-1->0->-1 [14] -1/-1/-1->0->-1 [15] -1/-1/-1->0->-1 [16] -1/-1/-1->0->-1 [17] -1/-1/-1->0->-1 [18] -1/-1/-1->0->-1 [19] -1/-1/-1->0->-1 [20] -1/-1/-1->0->-1 [21] -1/-1/-1->0->-1 [22] -1/-1/-1->0->-1 [23] -1/-1/-1->0->-1 [24] -1/-1/-1->0->-1 [25] -1/-1/-1->0->-1 [26] -1/-1/-1->0->-1 [27] -1/-1/-1->0->-1 [28] -1/-1/-1->0->-1 [29] -1/-1/-1->0->-1 [30] -1/-1/-1->0->-1 [31] -1/-1/-1->0->-1
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO P2P Chunksize set to 131072
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Connected all rings
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO Connected all trees
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO 32 coll channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
[2023-09-19 09:55:45,937] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
f5adbc1a15d5430d990ae2dd9a192ddb000001:50:263 [0] NCCL INFO comm 0x57077530 rank 0 nranks 1 cudaDev 0 busId 100000 commId 0x76a0036aa1e0d355 - Init COMPLETE
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py38_cu117/cpu_adam...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/cpu_adam/build.ninja...
Building extension module cpu_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
    - Avoid using `tokenizers` before the fork if possible
    - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1/3] /usr/local/cuda/bin/nvcc  -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=compute_70 -gencode=arch=compute_70,code=sm_70 --compiler-options '-fPIC' -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70 -c /opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/ops/csrc/common/custom_cuda_kernel.cu -o custom_cuda_kernel.cuda.o 
[2/3] c++ -MMD -MF cpu_adam.o.d -DTORCH_EXTENSION_NAME=cpu_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -I/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/ops/csrc/includes -I/usr/local/cuda/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/TH -isystem /opt/conda/envs/ptca/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/cuda/include -isystem /opt/conda/envs/ptca/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++17 -O3 -std=c++17 -g -Wno-reorder -L/usr/local/cuda/lib64 -lcudart -lcublas -g -march=native -fopenmp -D__AVX512__ -D__ENABLE_CUDA__ -c /opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/ops/csrc/adam/cpu_adam.cpp -o cpu_adam.o 
[3/3] c++ cpu_adam.o custom_cuda_kernel.cuda.o -shared -lcurand -L/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/lib -lc10 -lc10_cuda -ltorch_cpu -ltorch_cuda -ltorch -ltorch_python -L/usr/local/cuda/lib64 -lcudart -o cpu_adam.so
Time to load cpu_adam op: 33.48447632789612 seconds
[2023-09-19 09:56:21,653] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
[2023-09-19 09:56:21,664] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam
[2023-09-19 09:56:21,664] [INFO] [utils.py:54:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type=<class 'deepspeed.ops.adam.cpu_adam.DeepSpeedCPUAdam'>
[2023-09-19 09:56:21,665] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
[2023-09-19 09:56:21,665] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 3 optimizer
Adam Optimizer #0 is created with AVX512 arithmetic capability.
Config: alpha=0.000050, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1
[2023-09-19 09:56:21,923] [INFO] [utils.py:785:see_memory_usage] Stage 3 initialize beginning
[2023-09-19 09:56:21,924] [INFO] [utils.py:786:see_memory_usage] MA 0.07 GB         Max_MA 0.73 GB         CA 0.32 GB         Max_CA 1 GB 
[2023-09-19 09:56:21,924] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.05 GB, percent = 4.4%
[2023-09-19 09:56:21,927] [INFO] [stage3.py:113:__init__] Reduce bucket size 16777216
[2023-09-19 09:56:21,927] [INFO] [stage3.py:114:__init__] Prefetch bucket size 15099494
[2023-09-19 09:56:22,092] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
[2023-09-19 09:56:22,092] [INFO] [utils.py:786:see_memory_usage] MA 0.07 GB         Max_MA 0.07 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,093] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.05 GB, percent = 4.4%
Parameter Offload: Total persistent parameters: 4485120 in 195 params
[2023-09-19 09:56:22,319] [INFO] [utils.py:785:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
[2023-09-19 09:56:22,320] [INFO] [utils.py:786:see_memory_usage] MA 0.07 GB         Max_MA 0.07 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,320] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.06 GB, percent = 4.4%
[2023-09-19 09:56:22,481] [INFO] [utils.py:785:see_memory_usage] Before creating fp16 partitions
[2023-09-19 09:56:22,481] [INFO] [utils.py:786:see_memory_usage] MA 0.07 GB         Max_MA 0.07 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,482] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.06 GB, percent = 4.4%
[2023-09-19 09:56:22,656] [INFO] [utils.py:785:see_memory_usage] After creating fp16 partitions: 1
[2023-09-19 09:56:22,656] [INFO] [utils.py:786:see_memory_usage] MA 0.06 GB         Max_MA 0.07 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,657] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.07 GB, percent = 4.4%
[2023-09-19 09:56:22,810] [INFO] [utils.py:785:see_memory_usage] Before creating fp32 partitions
[2023-09-19 09:56:22,811] [INFO] [utils.py:786:see_memory_usage] MA 0.06 GB         Max_MA 0.06 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,811] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.07 GB, percent = 4.4%
[2023-09-19 09:56:22,986] [INFO] [utils.py:785:see_memory_usage] After creating fp32 partitions
[2023-09-19 09:56:22,987] [INFO] [utils.py:786:see_memory_usage] MA 0.06 GB         Max_MA 0.06 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:22,987] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.12 GB, percent = 4.4%
[2023-09-19 09:56:23,149] [INFO] [utils.py:785:see_memory_usage] Before initializing optimizer states
[2023-09-19 09:56:23,150] [INFO] [utils.py:786:see_memory_usage] MA 0.06 GB         Max_MA 0.06 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:23,150] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.12 GB, percent = 4.4%
[2023-09-19 09:56:23,359] [INFO] [utils.py:785:see_memory_usage] After initializing optimizer states
[2023-09-19 09:56:23,360] [INFO] [utils.py:786:see_memory_usage] MA 0.06 GB         Max_MA 0.06 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:23,360] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.19 GB, percent = 4.4%
[2023-09-19 09:56:23,360] [INFO] [stage3.py:387:_setup_for_real_optimizer] optimizer state initialized
[2023-09-19 09:56:23,646] [INFO] [utils.py:785:see_memory_usage] After initializing ZeRO optimizer
[2023-09-19 09:56:23,647] [INFO] [utils.py:786:see_memory_usage] MA 0.09 GB         Max_MA 0.09 GB         CA 0.32 GB         Max_CA 0 GB 
[2023-09-19 09:56:23,647] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory:  used = 29.19 GB, percent = 4.4%
[2023-09-19 09:56:23,647] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw
[2023-09-19 09:56:23,647] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupLR
[2023-09-19 09:56:23,647] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <deepspeed.runtime.lr_schedules.WarmupLR object at 0x149ab9770cd0>
[2023-09-19 09:56:23,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05], mom=[[0.9, 0.999]]
[2023-09-19 09:56:23,649] [INFO] [config.py:960:print] DeepSpeedEngine configuration:
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   activation_checkpointing_config  {
    "partition_activations": false, 
    "contiguous_memory_optimization": false, 
    "cpu_checkpointing": false, 
    "number_checkpoints": null, 
    "synchronize_checkpoint_boundary": false, 
    "profile": false
}
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   amp_enabled .................. False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   amp_params ................... False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   autotuning_config ............ {
    "enabled": false, 
    "start_step": null, 
    "end_step": null, 
    "metric_path": null, 
    "arg_mappings": null, 
    "metric": "throughput", 
    "model_info": null, 
    "results_dir": "autotuning_results", 
    "exps_dir": "autotuning_exps", 
    "overwrite": true, 
    "fast": true, 
    "start_profile_step": 3, 
    "end_profile_step": 5, 
    "tuner_type": "gridsearch", 
    "tuner_early_stopping": 5, 
    "tuner_num_trials": 50, 
    "model_info_path": null, 
    "mp_size": 1, 
    "max_train_batch_size": null, 
    "min_train_batch_size": 1, 
    "max_train_micro_batch_size_per_gpu": 1.024000e+03, 
    "min_train_micro_batch_size_per_gpu": 1, 
    "num_tuning_micro_batch_sizes": 3
}
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   bfloat16_enabled ............. False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   checkpoint_parallel_write_pipeline  False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   checkpoint_tag_validation_enabled  True
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   checkpoint_tag_validation_fail  False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x149bfd6c97c0>
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   communication_data_type ...... None
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   curriculum_enabled_legacy .... False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   curriculum_params_legacy ..... False
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
[2023-09-19 09:56:23,650] [INFO] [config.py:964:print]   data_efficiency_enabled ...... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   dataloader_drop_last ......... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   disable_allgather ............ False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   dump_state ................... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1}
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_enabled ........... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_gas_boundary_resolution  1
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_layer_name ........ bert.encoder.layer
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_layer_num ......... 0
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_max_iter .......... 100
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_stability ......... 1e-06
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_tol ............... 0.01
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   eigenvalue_verbose ........... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   elasticity_enabled ........... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   flops_profiler_config ........ {
    "enabled": false, 
    "recompute_fwd_factor": 0.0, 
    "profile_step": 1, 
    "module_depth": -1, 
    "top_modules": 1, 
    "detailed": true, 
    "output_file": null
}
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   fp16_auto_cast ............... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   fp16_enabled ................. True
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   fp16_master_weights_and_gradients  False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   global_rank .................. 0
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   grad_accum_dtype ............. None
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   gradient_accumulation_steps .. 1
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   gradient_clipping ............ 1.0
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   gradient_predivide_factor .... 1.0
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   initial_dynamic_scale ........ 65536
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   load_universal_checkpoint .... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   loss_scale ................... 0
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   memory_breakdown ............. False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   mics_hierarchial_params_gather  False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   mics_shard_size .............. -1
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   nebula_config ................ {
    "enabled": false, 
    "persistent_storage_path": null, 
    "persistent_time_interval": 100, 
    "num_of_version_in_retention": 2, 
    "enable_nebula_load": true, 
    "load_path": null
}
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   optimizer_legacy_fusion ...... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   optimizer_name ............... adamw
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   optimizer_params ............. {'lr': 5e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0}
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   pld_enabled .................. False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   pld_params ................... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   prescale_gradients ........... False
[2023-09-19 09:56:23,651] [INFO] [config.py:964:print]   scheduler_name ............... WarmupLR
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   scheduler_params ............. {'warmup_min_lr': 0, 'warmup_max_lr': 5e-05, 'warmup_num_steps': 0}
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   sparse_attention ............. None
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   sparse_gradients_enabled ..... False
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   steps_per_print .............. inf
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   train_batch_size ............. 1
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   train_micro_batch_size_per_gpu  1
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   use_node_local_storage ....... False
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   wall_clock_breakdown ......... False
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   world_size ................... 1
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   zero_allow_untested_optimizer  False
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=16777216 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='cpu', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=True) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=15099494 param_persistence_threshold=40960 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   zero_enabled ................. True
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   zero_force_ds_cpu_optimizer .. True
[2023-09-19 09:56:23,652] [INFO] [config.py:964:print]   zero_optimization_stage ...... 3
[2023-09-19 09:56:23,652] [INFO] [config.py:950:print_user_config]   json = {
    "fp16": {
        "enabled": true, 
        "loss_scale": 0, 
        "loss_scale_window": 1000, 
        "initial_scale_power": 16, 
        "hysteresis": 2, 
        "min_loss_scale": 1
    }, 
    "bf16": {
        "enabled": false
    }, 
    "optimizer": {
        "type": "AdamW", 
        "params": {
            "lr": 5e-05, 
            "betas": [0.9, 0.999], 
            "eps": 1e-08, 
            "weight_decay": 0.0
        }
    }, 
    "scheduler": {
        "type": "WarmupLR", 
        "params": {
            "warmup_min_lr": 0, 
            "warmup_max_lr": 5e-05, 
            "warmup_num_steps": 0
        }
    }, 
    "zero_optimization": {
        "stage": 3, 
        "offload_optimizer": {
            "device": "cpu", 
            "pin_memory": true
        }, 
        "offload_param": {
            "device": "cpu", 
            "pin_memory": true
        }, 
        "overlap_comm": true, 
        "contiguous_gradients": true, 
        "sub_group_size": 1.000000e+09, 
        "reduce_bucket_size": 1.677722e+07, 
        "stage3_prefetch_bucket_size": 1.509949e+07, 
        "stage3_param_persistence_threshold": 4.096000e+04, 
        "stage3_max_live_parameters": 1.000000e+09, 
        "stage3_max_reuse_distance": 1.000000e+09, 
        "stage3_gather_16bit_weights_on_model_save": true
    }, 
    "gradient_accumulation_steps": 1, 
    "gradient_clipping": 1.0, 
    "steps_per_print": inf, 
    "train_batch_size": 1, 
    "train_micro_batch_size_per_gpu": 1, 
    "wall_clock_breakdown": false
}
Loading extension module cpu_adam...
[INFO|trainer.py:1686] 2023-09-19 09:56:23,652 >> ***** Running training *****
[INFO|trainer.py:1687] 2023-09-19 09:56:23,652 >>   Num examples = 2,000
[INFO|trainer.py:1688] 2023-09-19 09:56:23,652 >>   Num Epochs = 3
[INFO|trainer.py:1689] 2023-09-19 09:56:23,652 >>   Instantaneous batch size per device = 1
[INFO|trainer.py:1692] 2023-09-19 09:56:23,652 >>   Total train batch size (w. parallel, distributed & accumulation) = 1
[INFO|trainer.py:1693] 2023-09-19 09:56:23,652 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1694] 2023-09-19 09:56:23,653 >>   Total optimization steps = 6,000
[INFO|trainer.py:1695] 2023-09-19 09:56:23,655 >>   Number of trainable parameters = 4,218,880

  0%|          | 0/6000 [00:00<?, ?it/s]Traceback (most recent call last):
  File "examples/pytorch/text-classification/run_glue.py", line 705, in <module>
    main()
  File "examples/pytorch/text-classification/run_glue.py", line 613, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/trainer.py", line 1539, in train
    return inner_training_loop(
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/trainer.py", line 1809, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/trainer.py", line 2654, in training_step
    loss = self.compute_loss(model, inputs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/trainer.py", line 2679, in compute_loss
    outputs = model(**inputs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
    ret_val = func(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/runtime/engine.py", line 1735, in forward
    loss = self.module(*inputs, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/peft/peft_model.py", line 736, in forward
    labels=labels,
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/transformers/models/llama/modeling_llama.py", line 957, in forward
    logits = self.score(hidden_states)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/peft/utils/other.py", line 145, in forward
    # replace possible -100 values in labels by `pad_token_id`
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1538, in _call_impl
    result = forward_call(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/runtime/zero/linear.py", line 106, in zero3_linear_wrap
    return LinearFunctionForZeroStage3.apply(input, weight)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/torch/cuda/amp/autocast_mode.py", line 98, in decorate_fwd
    return fwd(*args, **kwargs)
  File "/opt/conda/envs/ptca/lib/python3.8/site-packages/deepspeed/runtime/zero/linear.py", line 57, in forward
    output = input.matmul(weight.t())
RuntimeError: size mismatch, got 128, 128x4096,0

  0%|          | 0/6000 [00:02<?, ?it/s]

huggingface / peft

Add LoRA support for Llama models for sequence classification task #877

Feature request

Motivation

Your contribution