InternLM / xtuner

An efficient, flexible and full-featured toolkit for fine-tuning LLM (InternLM2, Llama3, Phi3, Qwen, Mistral, ...)
https://xtuner.readthedocs.io/zh-cn/latest/
Apache License 2.0

Error when doing sft training according to `https://xtuner.readthedocs.io/en/latest/get_started/quickstart.html#` #899

Open · YanShuang17 opened this issue 3 months ago

YanShuang17 commented 3 months ago

CLI command

CUDA_VISIBLE_DEVICES=5 xtuner train internlm_chat_7b_qlora_colorist_e5.py

Content of internlm_chat_7b_qlora_colorist_e5.py

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b/'
use_varlen_attn = False
# framework = 'huggingface'
work_dir = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx/'
# Data
data_path = './colors/train.jsonl'
prompt_template = PROMPT_TEMPLATE.internlm2_chat
max_length = 8092
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device

# Gradient accumulation setting defined in mmengine, equivalent to
# gradient_accumulation_steps in transformers.Trainer
accumulative_counts = 1
# parallel
sequence_parallel_size = 1
accumulative_counts *= sequence_parallel_size  # ?
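# (Presumably, with sequence parallel each sample is split across
# `sequence_parallel_size` GPUs, so scaling accumulative_counts keeps the
# effective global batch size unchanged; with sequence_parallel_size = 1
# this line is a no-op.)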

dataloader_num_workers = 0
max_epochs = 5
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 500
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 200
SYSTEM = SYSTEM_TEMPLATE.colorist
evaluation_inputs = [
    '请给我一个像天空一样清澈透明的蓝色。', 'Please give me a clear blue like the sky.'
]

#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
train_dataset = dict(
    type=process_hf_dataset,
    # dataset=dict(type=load_dataset, path=data_path),
    dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=colors_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=train_dataset,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    # dict(type=DatasetInfoHook, tokenizer=tokenizer),
    # dict(
    #     type=EvaluateChatHook,
    #     tokenizer=tokenizer,
    #     every_n_iters=evaluation_freq,
    #     evaluation_inputs=evaluation_inputs,
    #     system=SYSTEM,
    #     prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=True,
        interval=1,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)

Error info

08/16 11:10:26 - mmengine - WARNING - Dataset Dataset has no metainfo. ``dataset_meta`` in visualizer will be None.
08/16 11:10:27 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
08/16 11:10:27 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
08/16 11:10:27 - mmengine - INFO - Checkpoints will be saved to /juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx.
/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
Traceback (most recent call last):
  File "/juicefs-algorithm/workspace/nlp/shuang_yan/xtuner/xtuner/tools/train.py", line 366, in <module>
    main()
  File "/juicefs-algorithm/workspace/nlp/shuang_yan/xtuner/xtuner/tools/train.py", line 362, in main
    runner.train()
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/mmengine/runner/runner.py", line 1777, in train
    model = self.train_loop.run()  # type: ignore
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/mmengine/runner/loops.py", line 287, in run
    self.run_iter(data_batch)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/mmengine/runner/loops.py", line 311, in run_iter
    outputs = self.runner.model.train_step(
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 114, in train_step
    losses = self._run_forward(data, mode='loss')  # type: ignore
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/mmengine/model/base_model/base_model.py", line 361, in _run_forward
    results = self(**data, mode=mode)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/juicefs-algorithm/workspace/nlp/shuang_yan/xtuner/xtuner/model/sft.py", line 255, in forward
    return self.compute_loss(data, data_samples)
  File "/juicefs-algorithm/workspace/nlp/shuang_yan/xtuner/xtuner/model/sft.py", line 301, in compute_loss
    outputs = self.llm(**data)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/peft/peft_model.py", line 1091, in forward
    return self.base_model(
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 160, in forward
    return self.model.forward(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/shuang_yan/.cache/huggingface/modules/transformers_modules/modeling_internlm2.py", line 1047, in forward
    outputs = self.model(
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/shuang_yan/.cache/huggingface/modules/transformers_modules/modeling_internlm2.py", line 924, in forward
    layer_outputs = torch.utils.checkpoint.checkpoint(
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/_compile.py", line 24, in inner
    return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/_dynamo/eval_frame.py", line 489, in _fn
    return fn(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/_dynamo/external_utils.py", line 17, in inner
    return fn(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 482, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/autograd/function.py", line 553, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/utils/checkpoint.py", line 261, in forward
    outputs = run_function(*args)
  File "/home/shuang_yan/.cache/huggingface/modules/transformers_modules/modeling_internlm2.py", line 920, in custom_forward
    return module(*inputs, output_attentions, None)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/shuang_yan/.cache/huggingface/modules/transformers_modules/modeling_internlm2.py", line 639, in forward
    hidden_states, self_attn_weights, present_key_value = self.attention(
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/juicefs-algorithm/workspace/nlp/shuang_yan/xtuner/xtuner/model/modules/dispatch/internlm2.py", line 102, in internlm2_attn_forward
    cos, sin = self.rotary_emb(value_states, position_ids)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shuang_yan/.cache/huggingface/modules/transformers_modules/modeling_internlm2.py", line 159, in forward
    if seq_len > self.max_seq_len_cached:
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
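
From the traceback, the xtuner dispatch (xtuner/model/modules/dispatch/internlm2.py:102) calls self.rotary_emb(value_states, position_ids), while the cached remote-code modeling_internlm2.py (line 159) still compares its second argument against the integer self.max_seq_len_cached as a scalar seq_len. That suggests a version mismatch between xtuner's dispatch and the cached modeling file: the 2-D position_ids tensor reaches the scalar comparison and PyTorch cannot reduce it to a single boolean. A minimal sketch of that failure mode (the names and tensor shape below are only illustrative, not taken from the actual model):

import torch

# Hypothetical stand-ins: `position_ids` plays the role of the value that
# reaches the comparison where a scalar `seq_len` is expected.
max_seq_len_cached = 2048
position_ids = torch.arange(4).unsqueeze(0)  # shape (1, 4)

try:
    # Comparing a multi-element tensor with an int yields a tensor of bools;
    # using it in `if` needs a single truth value, hence the RuntimeError.
    if position_ids > max_seq_len_cached:
        pass
except RuntimeError as e:
    print(e)  # Boolean value of Tensor with more than one value is ambiguous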
YanShuang17 commented 3 months ago

But when I replace the InternLM2 model with Qwen1.5-4B-Chat, it works...

Content of internlm_chat_7b_qlora_colorist_e5.py (modified to use Qwen1.5-4B-Chat)

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
# pretrained_model_name_or_path = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b/'
pretrained_model_name_or_path = '/juicefs-algorithm/models/nlp/huggingface/Qwen/Qwen1.5-4B-Chat'
use_varlen_attn = False
# framework = 'huggingface'
work_dir = '/juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx/'
# Data
data_path = './colors/train.jsonl'
# prompt_template = PROMPT_TEMPLATE.internlm2_chat
prompt_template = PROMPT_TEMPLATE.qwen_chat
max_length = 2048

pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device

# Gradient accumulation setting defined in mmengine, equivalent to
# gradient_accumulation_steps in transformers.Trainer
accumulative_counts = 1
# parallel
sequence_parallel_size = 1
accumulative_counts *= sequence_parallel_size  # ?

dataloader_num_workers = 0
max_epochs = 5
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 500
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 200
SYSTEM = SYSTEM_TEMPLATE.colorist
evaluation_inputs = [
    '请给我一个像天空一样清澈透明的蓝色。', 'Please give me a clear blue like the sky.'
]

#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
train_dataset = dict(
    type=process_hf_dataset,
    # dataset=dict(type=load_dataset, path=data_path),
    dataset=dict(type=load_dataset, path='json', data_files=dict(train=data_path)),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=colors_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=train_dataset,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    # dict(type=DatasetInfoHook, tokenizer=tokenizer),
    # dict(
    #     type=EvaluateChatHook,
    #     tokenizer=tokenizer,
    #     every_n_iters=evaluation_freq,
    #     evaluation_inputs=evaluation_inputs,
    #     system=SYSTEM,
    #     prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=True,
        interval=1,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)

Log info

08/16 11:26:31 - mmengine - WARNING - Dataset Dataset has no metainfo. ``dataset_meta`` in visualizer will be None.
08/16 11:26:31 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
08/16 11:26:31 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
08/16 11:26:31 - mmengine - INFO - Checkpoints will be saved to /juicefs-algorithm/models/nlp/huggingface/internlm/internlm2-chat-1_8b-xxx.
/data/shuang_yan/qx_agent/lib/python3.10/site-packages/torch/utils/checkpoint.py:460: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.
  warnings.warn(
08/16 11:26:39 - mmengine - INFO - Iter(train) [  10/5705]  lr: 1.0590e-05  eta: 1:20:07  time: 0.8442  data_time: 0.0113  memory: 11785  grad_norm: 0.7445  loss: 2.2287
08/16 11:26:48 - mmengine - INFO - Iter(train) [  20/5705]  lr: 2.2355e-05  eta: 1:18:24  time: 0.8109  data_time: 0.0158  memory: 11785  grad_norm: 0.6206  loss: 2.0102
08/16 11:26:56 - mmengine - INFO - Iter(train) [  30/5705]  lr: 3.4119e-05  eta: 1:17:46  time: 0.8118  data_time: 0.0144  memory: 11785  grad_norm: 0.4532  loss: 1.8296
08/16 11:27:04 - mmengine - INFO - Iter(train) [  40/5705]  lr: 4.5884e-05  eta: 1:17:49  time: 0.8300  data_time: 0.0144  memory: 11785  grad_norm: nan  loss: 1.7190
08/16 11:27:12 - mmengine - INFO - Iter(train) [  50/5705]  lr: 5.7648e-05  eta: 1:17:33  time: 0.8171  data_time: 0.0148  memory: 11785  grad_norm: 0.6070  loss: 1.6344
08/16 11:27:20 - mmengine - INFO - Iter(train) [  60/5705]  lr: 6.9413e-05  eta: 1:17:19  time: 0.8173  data_time: 0.0142  memory: 11785  grad_norm: 0.8265  loss: 1.5743
08/16 11:27:29 - mmengine - INFO - Iter(train) [  70/5705]  lr: 8.1178e-05  eta: 1:17:09  time: 0.8196  data_time: 0.0149  memory: 11785  grad_norm: 1.3018  loss: 1.4253
08/16 11:27:37 - mmengine - INFO - Iter(train) [  80/5705]  lr: 9.2942e-05  eta: 1:17:02  time: 0.8233  data_time: 0.0156  memory: 11785  grad_norm: 0.6258  loss: 1.4175
08/16 11:27:45 - mmengine - INFO - Iter(train) [  90/5705]  lr: 1.0471e-04  eta: 1:16:55  time: 0.8240  data_time: 0.0155  memory: 11785  grad_norm: 0.6738  loss: 1.4359
08/16 11:27:53 - mmengine - INFO - Iter(train) [ 100/5705]  lr: 1.1647e-04  eta: 1:16:49  time: 0.8253  data_time: 0.0151  memory: 11785  grad_norm: 0.6601  loss: 1.3662
08/16 11:28:01 - mmengine - INFO - Iter(train) [ 110/5705]  lr: 1.2824e-04  eta: 1:16:42  time: 0.8246  data_time: 0.0139  memory: 11785  grad_norm: 0.6078  loss: 1.2998
08/16 11:28:10 - mmengine - INFO - Iter(train) [ 120/5705]  lr: 1.4000e-04  eta: 1:16:36  time: 0.8272  data_time: 0.0153  memory: 11785  grad_norm: 0.6458  loss: 1.3379
08/16 11:28:18 - mmengine - INFO - Iter(train) [ 130/5705]  lr: 1.5177e-04  eta: 1:16:30  time: 0.8283  data_time: 0.0157  memory: 11785  grad_norm: 0.5611  loss: 1.2955
08/16 11:28:26 - mmengine - INFO - Iter(train) [ 140/5705]  lr: 1.6353e-04  eta: 1:16:23  time: 0.8282  data_time: 0.0150  memory: 11785  grad_norm: 0.5902  loss: 1.2743
08/16 11:28:35 - mmengine - INFO - Iter(train) [ 150/5705]  lr: 1.7529e-04  eta: 1:16:17  time: 0.8291  data_time: 0.0153  memory: 11785  grad_norm: 0.4927  loss: 1.3233
08/16 11:28:43 - mmengine - INFO - Iter(train) [ 160/5705]  lr: 1.8706e-04  eta: 1:16:10  time: 0.8279  data_time: 0.0156  memory: 11785  grad_norm: nan  loss: 1.2841
08/16 11:28:51 - mmengine - INFO - Iter(train) [ 170/5705]  lr: 1.9882e-04  eta: 1:16:04  time: 0.8290  data_time: 0.0152  memory: 11785  grad_norm: 0.4845  loss: 1.2512
08/16 11:28:59 - mmengine - INFO - Iter(train) [ 180/5705]  lr: 2.0000e-04  eta: 1:15:57  time: 0.8297  data_time: 0.0159  memory: 11785  grad_norm: 0.7162  loss: 1.2590
08/16 11:29:08 - mmengine - INFO - Iter(train) [ 190/5705]  lr: 1.9999e-04  eta: 1:15:50  time: 0.8279  data_time: 0.0142  memory: 11785  grad_norm: 0.4851  loss: 1.2564
08/16 11:29:16 - mmengine - INFO - Iter(train) [ 200/5705]  lr: 1.9999e-04  eta: 1:15:42  time: 0.8290  data_time: 0.0153  memory: 11785  grad_norm: 0.5706  loss: 1.2485
08/16 11:29:24 - mmengine - INFO - Iter(train) [ 210/5705]  lr: 1.9998e-04  eta: 1:15:35  time: 0.8292  data_time: 0.0154  memory: 11785  grad_norm: 0.4805  loss: 1.2792
08/16 11:29:33 - mmengine - INFO - Iter(train) [ 220/5705]  lr: 1.9996e-04  eta: 1:15:28  time: 0.8307  data_time: 0.0167  memory: 11785  grad_norm: 0.6354  loss: 1.3057
08/16 11:29:41 - mmengine - INFO - Iter(train) [ 230/5705]  lr: 1.9995e-04  eta: 1:15:21  time: 0.8289  data_time: 0.0147  memory: 11785  grad_norm: 0.3977  loss: 1.2304
08/16 11:29:49 - mmengine - INFO - Iter(train) [ 240/5705]  lr: 1.9993e-04  eta: 1:15:13  time: 0.8288  data_time: 0.0148  memory: 11785  grad_norm: 0.4613  loss: 1.2297
08/16 11:29:58 - mmengine - INFO - Iter(train) [ 250/5705]  lr: 1.9990e-04  eta: 1:15:06  time: 0.8299  data_time: 0.0155  memory: 11785  grad_norm: 0.6767  loss: 1.1892
08/16 11:30:06 - mmengine - INFO - Iter(train) [ 260/5705]  lr: 1.9988e-04  eta: 1:14:58  time: 0.8304  data_time: 0.0159  memory: 11785  grad_norm: 0.5016  loss: 1.2711
08/16 11:30:14 - mmengine - INFO - Iter(train) [ 270/5705]  lr: 1.9985e-04  eta: 1:14:51  time: 0.8279  data_time: 0.0142  memory: 11785  grad_norm: 0.4309  loss: 1.2442
08/16 11:30:22 - mmengine - INFO - Iter(train) [ 280/5705]  lr: 1.9981e-04  eta: 1:14:42  time: 0.8274  data_time: 0.0135  memory: 11785  grad_norm: 0.4576  loss: 1.2365
...