+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
+-----------------------------------------------------------------------------+
Who can help?
No response
Information
[X] The official example scripts
[ ] My own modified scripts
Reproduction
# -*- coding: utf-8 -*-
import os
import jieba
import dataclasses as dc
import functools
from collections.abc import Callable, Mapping, Sequence
from pathlib import Path
from typing import Annotated, Any, Union
import numpy as np
import ruamel.yaml as yaml
import torch
import typer
from datasets import Dataset, Split
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from peft import PeftConfig, get_peft_config, get_peft_model
from rouge_chinese import Rouge
from torch import nn
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
EvalPrediction,
GenerationConfig,
PreTrainedTokenizer,
Seq2SeqTrainingArguments,
)
from transformers import DataCollatorForSeq2Seq as _DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer as _Seq2SeqTrainer
from datasets import load_dataset, DatasetDict, NamedSplit
from typing import Optional

app = typer.Typer(pretty_exceptions_show_locals=False)

class DataCollatorForSeq2Seq(_DataCollatorForSeq2Seq):
    def __call__(self, features, return_tensors=None):
        output_ids = ([feature['output_ids'] for feature in features]
                      if 'output_ids' in features[0].keys() else None)
        if output_ids is not None:
            max_output_length = max(len(out) for out in output_ids)
            if self.pad_to_multiple_of is not None:
                # Round the target length up to the next multiple of pad_to_multiple_of.
                max_output_length = (
                        (max_output_length + self.pad_to_multiple_of - 1)
                        // self.pad_to_multiple_of * self.pad_to_multiple_of
                )
            for feature in features:
                remainder = [self.tokenizer.pad_token_id] * (
                        max_output_length - len(feature['output_ids'])
                )
                if isinstance(feature['output_ids'], list):
                    feature['output_ids'] = feature['output_ids'] + remainder
                else:
                    feature['output_ids'] = np.concatenate(
                        [feature['output_ids'], remainder]
                    ).astype(np.int64)
        return super().__call__(features, return_tensors)
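For intuition, here is a minimal standalone sketch of the padding step the collator performs on output_ids (pad_output_ids is a hypothetical helper, not part of the script; pad token 0 and multiple 8 are made-up values):

def pad_output_ids(features, pad_token_id=0, pad_to_multiple_of=8):
    # Longest 'output_ids' in the batch.
    max_len = max(len(f['output_ids']) for f in features)
    # Round up to the next multiple of pad_to_multiple_of: ceil(n / m) * m.
    max_len = (max_len + pad_to_multiple_of - 1) // pad_to_multiple_of * pad_to_multiple_of
    for f in features:
        f['output_ids'] = f['output_ids'] + [pad_token_id] * (max_len - len(f['output_ids']))
    return features

print(pad_output_ids([{'output_ids': [1, 2, 3]}, {'output_ids': [4, 5]}]))
# Both sequences come back padded to length 8.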
def process_message(message):
    if 'tools' in message and message['role'] == 'system':
        for tool in message['tools']:
            parameters = tool['function']['parameters']['properties']
            tool['function']['parameters']['properties'] = \
                {k: v for k, v in parameters.items() if v is not None}
    elif 'tools' in message:
        del message['tools']
    return message
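A quick illustration of process_message on a made-up system message (the tool schema below is an assumed OpenAI-style example, not from this report): properties whose value is None are dropped, and non-system messages lose their 'tools' key entirely.

msg = {
    'role': 'system',
    'tools': [{'function': {'parameters': {'properties': {
        'location': {'type': 'string'},  # kept
        'unit': None,                    # dropped
    }}}}],
}
print(process_message(msg)['tools'][0]['function']['parameters']['properties'])
# -> {'location': {'type': 'string'}}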
class Seq2SeqTrainer(_Seq2SeqTrainer):
    # Not Support for apex
    ...


@dc.dataclass
class DataConfig(object):
    train_file: Optional[str] = None
    val_file: Optional[str] = None
    test_file: Optional[str] = None
    num_proc: Optional[int] = None


@dc.dataclass
class FinetuningConfig(object):
    data_config: DataConfig
    ...


def _load_datasets(
        data_dir: str,
        data_format: str,
        data_files: dict[NamedSplit, str],
        num_proc: Optional[int],
) -> DatasetDict:
    if data_format == '.jsonl':
        dataset_dct = load_dataset(
            data_dir,
            data_files=data_files,
            split=None,
            num_proc=num_proc,
        )
    else:
        raise NotImplementedError(f"Cannot load dataset in the '{data_format}' format.")
    return dataset_dct


class DataManager(object):
    def __init__(self, data_dir: str, data_config: DataConfig):
        self._num_proc = data_config.num_proc
        ...


def process_batch(
        batch: Mapping[str, Sequence],
        tokenizer: PreTrainedTokenizer,
        max_input_length: int,
        max_output_length: int,
) -> dict[str, list]:
    batched_conv = batch['messages']
    batched_input_ids = []
    batched_labels = []
    ...


def process_batch_eval(
        batch: Mapping[str, Sequence],
        tokenizer: PreTrainedTokenizer,
        max_input_length: int,
        max_output_length: int,
) -> dict[str, list]:
    batched_conv = batch['messages']
    batched_input_ids = []
    batched_output_ids = []
    for conv in batched_conv:
        # 151331 / 151333 appear to be GLM-4's [gMASK] / <sop> special tokens.
        input_ids = [151331, 151333]
        for message in conv:
            if len(input_ids) >= max_input_length:
                break
            else:
                message = process_message(message)
                # [2:] drops the leading special-token pair that apply_chat_template prepends.
                new_input_ids = tokenizer.apply_chat_template([message], tokenize=True, return_dict=False)[2:]
                if message['role'] == 'assistant':
                    output_prompt, output_ids = (
                        new_input_ids[:1],
                        new_input_ids[1:],
                    )
                    output_ids.append(151336)
                    batched_input_ids.append(
                        input_ids[:max_input_length] + output_prompt[:1]
                    )
                    batched_output_ids.append(output_ids[:max_output_length])
                input_ids += new_input_ids
    del batched_conv, conv, input_ids, message, new_input_ids, output_prompt, output_ids
    torch.cuda.empty_cache()
    return {'input_ids': batched_input_ids, 'output_ids': batched_output_ids}
def load_tokenizer_and_model(
        model_dir: str,
        peft_config: Optional[PeftConfig] = None,
):
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    if peft_config is not None:
        model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            empty_init=False,
            use_cache=False,
            torch_dtype=torch.bfloat16  # Must use BFloat 16
        )
        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_dir,
            trust_remote_code=True,
            empty_init=False,
            use_cache=False,
            torch_dtype=torch.bfloat16
        )
    return tokenizer, model
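A hypothetical usage sketch of load_tokenizer_and_model with a LoRA adapter (the model id and hyperparameters below are placeholders, not values from this report):

from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # causal-LM fine-tuning
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
tokenizer, model = load_tokenizer_and_model('THUDM/glm-4-9b-chat', peft_config=lora_config)

Since print_trainable_parameters() is called inside the function, the trainable-parameter summary prints as soon as the adapter is attached.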
def compute_metrics(eval_preds: EvalPrediction, tokenizer):
    batched_pred_ids, batched_label_ids = eval_preds
    metrics_dct = {'rouge-1': [], 'rouge-2': [], 'rouge-l': [], 'bleu-4': []}
    for pred_ids, label_ids in zip(batched_pred_ids, batched_label_ids):
        pred_txt = tokenizer.decode(pred_ids).strip()
        label_txt = tokenizer.decode(label_ids).strip()
        pred_tokens = list(jieba.cut(pred_txt))
        label_tokens = list(jieba.cut(label_txt))
        rouge = Rouge()
        scores = rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))
        for k, v in scores[0].items():
            metrics_dct[k].append(round(v['f'] * 100, 4))
        metrics_dct['bleu-4'].append(
            sentence_bleu([label_tokens], pred_tokens,
                          smoothing_function=SmoothingFunction().method3))
    return {k: np.mean(v) for k, v in metrics_dct.items()}
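As a sanity check of the metric plumbing, the same ROUGE/BLEU calls can be run on toy strings (illustrative values only; assumes the jieba, rouge_chinese and nltk imports above):

pred_tokens = list(jieba.cut('今天天气很好'))
label_tokens = list(jieba.cut('今天天气不错'))
rouge = Rouge()
print(rouge.get_scores(' '.join(pred_tokens), ' '.join(label_tokens))[0])
print(sentence_bleu([label_tokens], pred_tokens,
                    smoothing_function=SmoothingFunction().method3))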
@app.command()
def main(
        data_dir: Annotated[str, typer.Argument(help='')],
        model_dir: Annotated[
            str,
            typer.Argument(
                help='A string that specifies the model id of a pretrained model configuration hosted on huggingface.co, or a path to a directory containing a model configuration file.'
            ),
        ],
        config_file: Annotated[str, typer.Argument(help='')],
        auto_resume_from_checkpoint: str = typer.Argument(
            default='',
            help='If "yes", automatically resume from the latest saved checkpoint. If a number (e.g. 12 or 15), resume from the checkpoint with that step number. If "no" or empty, start training from scratch.'
        ),
):
    ft_config = FinetuningConfig.from_file(config_file)
    tokenizer, model = load_tokenizer_and_model(model_dir, peft_config=ft_config.peft_config)
    data_manager = DataManager(data_dir, ft_config.data_config)
    train_dataset = data_manager.get_dataset(
        Split.TRAIN,
        functools.partial(
            process_batch,
            tokenizer=tokenizer,
            max_input_length=ft_config.max_input_length,
            max_output_length=ft_config.max_output_length,
        ),
        batched=True,
    )
    print('train_dataset:', train_dataset)
    val_dataset = data_manager.get_dataset(
        Split.VALIDATION,
        functools.partial(
            process_batch_eval,
            tokenizer=tokenizer,
            max_input_length=ft_config.max_input_length,
            max_output_length=ft_config.max_output_length,
        ),
        batched=True,
    )
    if val_dataset is not None:
        print('val_dataset:', val_dataset)
    test_dataset = data_manager.get_dataset(
        Split.TEST,
        functools.partial(
            process_batch_eval,
            tokenizer=tokenizer,
            max_input_length=ft_config.max_input_length,
            max_output_length=ft_config.max_output_length,
        ),
        batched=True,
    )
    if test_dataset is not None:
        print('test_dataset:', test_dataset)
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    trainer = Seq2SeqTrainer(
        model=model,
        args=ft_config.training_args,
        data_collator=DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            padding='max_length',
            return_tensors='pt',
        ),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=functools.partial(compute_metrics, tokenizer=tokenizer),
    )
    # Test for None before calling .upper() to avoid an AttributeError.
    if auto_resume_from_checkpoint is None or auto_resume_from_checkpoint.upper() == "":
        trainer.train()
    else:
        output_dir = ft_config.training_args.output_dir
        dirlist = os.listdir(output_dir)
        checkpoint_sn = 0
        for checkpoint_str in dirlist:
            if checkpoint_str.find("eckpoint") > 0 and checkpoint_str.find("tmp") == -1:
                checkpoint = int(checkpoint_str.replace("checkpoint-", ""))
                if checkpoint > checkpoint_sn:
                    checkpoint_sn = checkpoint
        if auto_resume_from_checkpoint.upper() == "YES":
            if checkpoint_sn > 0:
                model.gradient_checkpointing_enable()
                model.enable_input_require_grads()
                checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
                print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
                trainer.train(resume_from_checkpoint=checkpoint_directory)
            else:
                trainer.train()
        else:
            if auto_resume_from_checkpoint.isdigit():
                if int(auto_resume_from_checkpoint) > 0:
                    checkpoint_sn = int(auto_resume_from_checkpoint)
                    model.gradient_checkpointing_enable()
                    model.enable_input_require_grads()
                    checkpoint_directory = os.path.join(output_dir, "checkpoint-" + str(checkpoint_sn))
                    print("resume checkpoint from checkpoint-" + str(checkpoint_sn))
                    trainer.train(resume_from_checkpoint=checkpoint_directory)
            else:
                print(auto_resume_from_checkpoint,
                      "The specified checkpoint sn(" + auto_resume_from_checkpoint + ") has not been saved. Please search for the correct checkpoint in the model output directory")
    if test_dataset is not None:
        trainer.predict(test_dataset)


if __name__ == '__main__':
    app()
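For reference, typer maps main's four positional arguments to the command line, so a hypothetical invocation (script name and paths are placeholders, not from this report) looks like:

python finetune.py data/AdvertiseGen/ THUDM/glm-4-9b-chat configs/lora.yaml yes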
Expected behavior
Fix the bug!