Any thoughts on this?
@pacman100
This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.
Please note that issues that do not follow the contributing guidelines are likely to be ignored.
🤗 Sorry @jacklanda, I think @muellerzr and @SunMarc will replace @pacman100 on such issues. Could one of you have a look?
Are there any thoughts on it?
At this time we do not support multiple models with DeepSpeed, please see: https://github.com/huggingface/accelerate/issues/2496
I see. Thanks for your message :)
System Info
Who can help?
@ArthurZucker @younesbelkada @pacman100
Information
Tasks
An officially supported task in the examples folder (such as GLUE/SQuAD, ...)

Reproduction
Error Messages
Dataset
I used the data examples in dataset.zip for training and validation in this error reproduction procedure. Please download it and decompress it with `unzip dataset.zip`.

Steps for Reproduction
1. Create a DeepSpeed config `deepspeed_config_zero3_without_offload.json`, as sketched below.
2. Use DeepSpeed to run a training script `test.py` that imports models from Hugging Face with statements like `AutoModel.from_pretrained(...)`.
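The original config attachment is not reproduced in the report. As an assumption (not necessarily the reporter's exact file), a representative ZeRO-3 configuration without CPU offload looks like this:

```json
{
  "bf16": { "enabled": "auto" },
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto"
}
```

Since the script's argument parser accepts a `--deepspeed` flag, the presumed launch command is:

```
deepspeed test.py --deepspeed deepspeed_config_zero3_without_offload.json
```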
The reproduction script `test.py`:

```python
import os
import sys
import json
import glob
import logging
import argparse
import warnings
from typing import List, Dict, Optional

import torch
import transformers
from evaluate import load
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from transformers import (
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
)
from transformers.tokenization_utils_base import BatchEncoding

warnings.filterwarnings("ignore")

TOKENIZER = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
TOKENIZER.pad_token_id = 0
TOKENIZER.bos_token_id = 1
TOKENIZER.eos_token_id = 2

# Second model in the session: the `evaluate` BERTScore metric loads its
# own scoring model under the hood.
BERT_SCORER = load("bertscore")


def preprocess_logits_for_metrics(logits, labels):
    """
    The original Trainer may cause an OOM issue. This is a workaround
    to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits, dim=-1)
    return pred_ids, labels


def compute_metrics(eval_preds):
    """Compute metrics for evaluation."""
    pred_ids = eval_preds.predictions[0]
    labels_ids = eval_preds.label_ids
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    # (The rest of the function is truncated in the original post.)


def get_logger(logger_name: str, output_dir: str) -> logging.Logger:
    """Initialize logger."""
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.DEBUG)
    os.makedirs(output_dir, exist_ok=True)
    file_handler = logging.FileHandler(
        os.path.join(output_dir, "log.txt"), mode="w")
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(file_handler)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(console_handler)
    return logger  # missing from the snippet as posted; implied by the annotation


def train(args: argparse.Namespace) -> None:
    """Training entry for supervised fine-tuning."""
    model_config = {
        "batch_size": 128,
        "num_epochs": 5,
        "per_device_train_batch_size": 32,
        "eval_times": 10,
        "warmup_rate": 0.06,
        "gradient_accumulation_steps": 1,
    }
    model_type = "llama"
    model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
    data_path_train = "./train.jsonl"
    data_path_valid = "./valid.jsonl"
    output_dir = "./output"
    max_seq_len = 128
    # (The Trainer wiring is truncated in the original post.)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--deepspeed", type=str, help="deepspeed config")
    parser.add_argument(
        "--resume_from_checkpoint",
        action="store_true",
        help="either training checkpoint or final adapter",
    )
    parser.add_argument("--local_rank", type=int)
    args = parser.parse_args()
    train(args)  # presumed entry call; truncated in the original post
```
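Because the `train` body stops at the hyperparameter block, here is a minimal sketch of how such a script typically wires up the Trainer. The names `train_data` and `valid_data` are hypothetical (presumably built from the JSONL files); this is not the reporter's exact code:

```python
from transformers import Trainer, TrainingArguments

# Hypothetical continuation of train(); a sketch, not the reporter's code.
model = LlamaForCausalLM.from_pretrained(model_name_or_path)
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=model_config["num_epochs"],
    per_device_train_batch_size=model_config["per_device_train_batch_size"],
    gradient_accumulation_steps=model_config["gradient_accumulation_steps"],
    evaluation_strategy="steps",
    deepspeed=args.deepspeed,  # hands the ZeRO-3 JSON over to the Trainer
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,   # hypothetical: built from ./train.jsonl
    eval_dataset=valid_data,    # hypothetical: built from ./valid.jsonl
    tokenizer=TOKENIZER,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
trainer.train()
```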
Expected behavior
It should allow me to compute the BERT Score on a GPU device during the interval (mid-training) evaluation, rather than throwing an error.
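For reference, the call the reporter expects to succeed inside `compute_metrics` would look roughly like the sketch below. This is an assumption based on the globals defined in `test.py`; the decoding step is a common pattern, not the reporter's exact code:

```python
import numpy as np

# Labels use -100 for ignored positions; swap in the pad token before decoding.
labels_ids = np.where(labels_ids != -100, labels_ids, TOKENIZER.pad_token_id)
preds = TOKENIZER.batch_decode(pred_ids, skip_special_tokens=True)
refs = TOKENIZER.batch_decode(labels_ids, skip_special_tokens=True)

# `evaluate`'s bertscore metric accepts a `device` argument; loading its
# scoring model onto the GPU mid-training is presumably where this fails,
# since Accelerate/DeepSpeed does not support a second model in the session.
scores = BERT_SCORER.compute(
    predictions=preds,
    references=refs,
    lang="en",
    device="cuda:0",
)
```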