[Open] YanShuang17 opened 3 months ago
But when I replace the InternLM model with Qwen1.5-4B-Chat, it works...
Content of internlm_chat_7b_qlora_colorist_e5.py:
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)
from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE
#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
# pretrained_model_name_or_path = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b/'
pretrained_model_name_or_path = '/juicefs-algorithm/models/nlp/huggingface/Qwen/Qwen1.5-4B-Chat'
use_varlen_attn = False
# framework = 'huggingface'
work_dir = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx/'
# Data
data_path = './colors/train.jsonl'
# prompt_template = PROMPT_TEMPLATE.internlm2_chat
prompt_template = PROMPT_TEMPLATE.qwen_chat
max_length = 2048
pack_to_max_length = True
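# NOTE (added for clarity): with pack_to_max_length=True, multiple short
# samples are concatenated into single max_length sequences, trading
# per-sample padding for higher throughput.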
# Scheduler & Optimizer
batch_size = 1 # per_device
# Gradient accumulation parameter defined in mmengine; equivalent to
# gradient_accumulation_steps in transformers.Trainer
accumulative_counts = 1
# parallel
sequence_parallel_size = 1
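# NOTE (added for clarity): every `sequence_parallel_size` GPUs share one
# sequence, which divides the effective data-parallel batch size by that
# factor; scaling gradient accumulation up by the same factor compensates.
# With sequence_parallel_size = 1 the line below is a no-op.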
accumulative_counts *= sequence_parallel_size # ?
dataloader_num_workers = 0
max_epochs = 5
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1 # grad clip
warmup_ratio = 0.03
# Save
save_steps = 500
save_total_limit = 2 # Maximum checkpoints to keep (-1 means unlimited)
# Evaluate the generation performance during the training
evaluation_freq = 200
SYSTEM = SYSTEM_TEMPLATE.colorist
evaluation_inputs = [
    '请给我一个像天空一样清澈透明的蓝色。', 'Please give me a clear blue like the sky.'
]
#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')
model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))
#######################################################################
#                     PART 3  Dataset & Dataloader                    #
#######################################################################
train_dataset = dict(
    type=process_hf_dataset,
    # dataset=dict(type=load_dataset, path=data_path),
    dataset=dict(
        type=load_dataset, path='json', data_files=dict(train=data_path)),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=colors_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)
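# NOTE (added for clarity): the commented-out form above,
# load_dataset(path=data_path), treats its argument as a Hub dataset name
# or a loading script; for a local .jsonl file the 'json' builder with
# data_files, as used here, is the correct invocation.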
train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=train_dataset,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))
#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')
# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]
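# NOTE (added for clarity): warmup covers warmup_ratio * max_epochs =
# 0.03 * 5 = 0.15 epochs. The log below reports 5705 iterations for 5
# epochs (1141 per epoch), so warmup lasts ~171 iterations, which matches
# the lr peaking at 2.0000e-04 around iter 180 before the cosine decay.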
# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)
#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    # dict(type=DatasetInfoHook, tokenizer=tokenizer),
    # dict(
    #     type=EvaluateChatHook,
    #     tokenizer=tokenizer,
    #     every_n_iters=evaluation_freq,
    #     evaluation_inputs=evaluation_inputs,
    #     system=SYSTEM,
    #     prompt_template=prompt_template)
]
if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]
# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save a checkpoint every epoch (`save_steps` above is unused with
    # `by_epoch=True`).
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=True,
        interval=1,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)
# configure environment
env_cfg = dict(
# whether to enable cudnn benchmark
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)
# set visualizer
visualizer = None
# set log level
log_level = 'INFO'
# load from which checkpoint
load_from = None
# whether to resume training from the loaded checkpoint
resume = False
# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)
# set log processor
log_processor = dict(by_epoch=False)
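For reference, PART 2 of the config above is roughly what one would build by hand with transformers and peft: a 4-bit NF4-quantized base model wrapped with a LoRA adapter. Below is a minimal sketch, assuming the same local Qwen path; XTuner's SupervisedFinetune does this wiring internally, so this is only for illustration, not the library's exact code path.

import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the base model in 4-bit NF4 with double quantization, fp16 compute.
llm = AutoModelForCausalLM.from_pretrained(
    '/juicefs-algorithm/models/nlp/huggingface/Qwen/Qwen1.5-4B-Chat',
    trust_remote_code=True,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4'))
# Casts norm/embedding layers for stable k-bit training.
llm = prepare_model_for_kbit_training(llm)
# Attach the LoRA adapter with the same hyperparameters as the config.
llm = get_peft_model(llm, LoraConfig(
    r=64, lora_alpha=16, lora_dropout=0.1, bias='none',
    task_type='CAUSAL_LM'))
llm.print_trainable_parameters()  # only the LoRA weights are trainable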
Error info
08/16 11:26:31 - mmengine - WARNING - Dataset Dataset has no metainfo. ``dataset_meta`` in visualizer will be None.
08/16 11:26:31 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
08/16 11:26:31 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
08/16 11:26:31 - mmengine - INFO - Checkpoints will be saved to /juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx.
/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
warnings.warn(
08/16 11:26:39 - mmengine - INFO - Iter(train) [ 10/5705] lr: 1.0590e-05 eta: 1:20:07 time: 0.8442 data_time: 0.0113 memory: 11785 grad_norm: 0.7445 loss: 2.2287
08/16 11:26:48 - mmengine - INFO - Iter(train) [ 20/5705] lr: 2.2355e-05 eta: 1:18:24 time: 0.8109 data_time: 0.0158 memory: 11785 grad_norm: 0.6206 loss: 2.0102
08/16 11:26:56 - mmengine - INFO - Iter(train) [ 30/5705] lr: 3.4119e-05 eta: 1:17:46 time: 0.8118 data_time: 0.0144 memory: 11785 grad_norm: 0.4532 loss: 1.8296
08/16 11:27:04 - mmengine - INFO - Iter(train) [ 40/5705] lr: 4.5884e-05 eta: 1:17:49 time: 0.8300 data_time: 0.0144 memory: 11785 grad_norm: nan loss: 1.7190
08/16 11:27:12 - mmengine - INFO - Iter(train) [ 50/5705] lr: 5.7648e-05 eta: 1:17:33 time: 0.8171 data_time: 0.0148 memory: 11785 grad_norm: 0.6070 loss: 1.6344
08/16 11:27:20 - mmengine - INFO - Iter(train) [ 60/5705] lr: 6.9413e-05 eta: 1:17:19 time: 0.8173 data_time: 0.0142 memory: 11785 grad_norm: 0.8265 loss: 1.5743
08/16 11:27:29 - mmengine - INFO - Iter(train) [ 70/5705] lr: 8.1178e-05 eta: 1:17:09 time: 0.8196 data_time: 0.0149 memory: 11785 grad_norm: 1.3018 loss: 1.4253
08/16 11:27:37 - mmengine - INFO - Iter(train) [ 80/5705] lr: 9.2942e-05 eta: 1:17:02 time: 0.8233 data_time: 0.0156 memory: 11785 grad_norm: 0.6258 loss: 1.4175
08/16 11:27:45 - mmengine - INFO - Iter(train) [ 90/5705] lr: 1.0471e-04 eta: 1:16:55 time: 0.8240 data_time: 0.0155 memory: 11785 grad_norm: 0.6738 loss: 1.4359
08/16 11:27:53 - mmengine - INFO - Iter(train) [ 100/5705] lr: 1.1647e-04 eta: 1:16:49 time: 0.8253 data_time: 0.0151 memory: 11785 grad_norm: 0.6601 loss: 1.3662
08/16 11:28:01 - mmengine - INFO - Iter(train) [ 110/5705] lr: 1.2824e-04 eta: 1:16:42 time: 0.8246 data_time: 0.0139 memory: 11785 grad_norm: 0.6078 loss: 1.2998
08/16 11:28:10 - mmengine - INFO - Iter(train) [ 120/5705] lr: 1.4000e-04 eta: 1:16:36 time: 0.8272 data_time: 0.0153 memory: 11785 grad_norm: 0.6458 loss: 1.3379
08/16 11:28:18 - mmengine - INFO - Iter(train) [ 130/5705] lr: 1.5177e-04 eta: 1:16:30 time: 0.8283 data_time: 0.0157 memory: 11785 grad_norm: 0.5611 loss: 1.2955
08/16 11:28:26 - mmengine - INFO - Iter(train) [ 140/5705] lr: 1.6353e-04 eta: 1:16:23 time: 0.8282 data_time: 0.0150 memory: 11785 grad_norm: 0.5902 loss: 1.2743
08/16 11:28:35 - mmengine - INFO - Iter(train) [ 150/5705] lr: 1.7529e-04 eta: 1:16:17 time: 0.8291 data_time: 0.0153 memory: 11785 grad_norm: 0.4927 loss: 1.3233
08/16 11:28:43 - mmengine - INFO - Iter(train) [ 160/5705] lr: 1.8706e-04 eta: 1:16:10 time: 0.8279 data_time: 0.0156 memory: 11785 grad_norm: nan loss: 1.2841
08/16 11:28:51 - mmengine - INFO - Iter(train) [ 170/5705] lr: 1.9882e-04 eta: 1:16:04 time: 0.8290 data_time: 0.0152 memory: 11785 grad_norm: 0.4845 loss: 1.2512
08/16 11:28:59 - mmengine - INFO - Iter(train) [ 180/5705] lr: 2.0000e-04 eta: 1:15:57 time: 0.8297 data_time: 0.0159 memory: 11785 grad_norm: 0.7162 loss: 1.2590
08/16 11:29:08 - mmengine - INFO - Iter(train) [ 190/5705] lr: 1.9999e-04 eta: 1:15:50 time: 0.8279 data_time: 0.0142 memory: 11785 grad_norm: 0.4851 loss: 1.2564
08/16 11:29:16 - mmengine - INFO - Iter(train) [ 200/5705] lr: 1.9999e-04 eta: 1:15:42 time: 0.8290 data_time: 0.0153 memory: 11785 grad_norm: 0.5706 loss: 1.2485
08/16 11:29:24 - mmengine - INFO - Iter(train) [ 210/5705] lr: 1.9998e-04 eta: 1:15:35 time: 0.8292 data_time: 0.0154 memory: 11785 grad_norm: 0.4805 loss: 1.2792
08/16 11:29:33 - mmengine - INFO - Iter(train) [ 220/5705] lr: 1.9996e-04 eta: 1:15:28 time: 0.8307 data_time: 0.0167 memory: 11785 grad_norm: 0.6354 loss: 1.3057
08/16 11:29:41 - mmengine - INFO - Iter(train) [ 230/5705] lr: 1.9995e-04 eta: 1:15:21 time: 0.8289 data_time: 0.0147 memory: 11785 grad_norm: 0.3977 loss: 1.2304
08/16 11:29:49 - mmengine - INFO - Iter(train) [ 240/5705] lr: 1.9993e-04 eta: 1:15:13 time: 0.8288 data_time: 0.0148 memory: 11785 grad_norm: 0.4613 loss: 1.2297
08/16 11:29:58 - mmengine - INFO - Iter(train) [ 250/5705] lr: 1.9990e-04 eta: 1:15:06 time: 0.8299 data_time: 0.0155 memory: 11785 grad_norm: 0.6767 loss: 1.1892
08/16 11:30:06 - mmengine - INFO - Iter(train) [ 260/5705] lr: 1.9988e-04 eta: 1:14:58 time: 0.8304 data_time: 0.0159 memory: 11785 grad_norm: 0.5016 loss: 1.2711
08/16 11:30:14 - mmengine - INFO - Iter(train) [ 270/5705] lr: 1.9985e-04 eta: 1:14:51 time: 0.8279 data_time: 0.0142 memory: 11785 grad_norm: 0.4309 loss: 1.2442
08/16 11:30:22 - mmengine - INFO - Iter(train) [ 280/5705] lr: 1.9981e-04 eta: 1:14:42 time: 0.8274 data_time: 0.0135 memory: 11785 grad_norm: 0.4576 loss: 1.2365
...