InternLM / xtuner

An efficient, flexible and full-featured toolkit for fine-tuning LLMs (InternLM2, Llama3, Phi3, Qwen, Mistral, ...)
https://xtuner.readthedocs.io/zh-cn/latest/
Apache License 2.0

[Bug] internlm qlora #493

Closed (rourouZ closed this issue 5 months ago)

rourouZ commented 5 months ago

Traceback (most recent call last):
  File "/icooper/tools/miniconda3/envs/internLM/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 649, in iflatmap_unordered
    yield queue.get(timeout=0.05)
  File "<string>", line 2, in get
  File "/icooper/tools/miniconda3/envs/internLM/lib/python3.10/site-packages/multiprocess/managers.py", line 818, in _callmethod
    kind, result = conn.recv()
  File "/icooper/tools/miniconda3/envs/internLM/lib/python3.10/site-packages/multiprocess/connection.py", line 253, in recv
    buf = self._recv_bytes()
  File "/icooper/tools/miniconda3/envs/internLM/lib/python3.10/site-packages/multiprocess/connection.py", line 417, in _recv_bytes
    buf = self._recv(4)
  File "/icooper/tools/miniconda3/envs/internLM/lib/python3.10/site-packages/multiprocess/connection.py", line 382, in _recv
    chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer

During handling of the above exception, another exception occurred:

multiprocess.context.TimeoutError

Exception ignored in tp_clear of: <class 'cell'>
SystemError: Negative size passed to PyUnicode_New
Exception ignored in: <built-in method _destroy of weakref.ReferenceType object at 0x7fb11ff47b00>
SystemError: <built-in method _destroy of weakref.ReferenceType object at 0x7fb11ff47b00> returned a result with an exception set

LZHgrla commented 5 months ago

@rourouZ
Please provide more details about your dataset, config, and commands.

rourouZ commented 5 months ago

Command

CUDA_VISIBLE_DEVICES=1 xtuner train {configPath}

Dataset test

The dataset passes xtuner check-custom-dataset.
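(For reference, the check tool is presumably invoked with the same config file as the training command; the exact invocation is not shown in the report, but it would look like:)

xtuner check-custom-dataset {configPath}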

config

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from glob import glob
from pathlib import Path
from datasets import load_dataset

from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR

from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import ConcatDataset, process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn,
                                    law_reference_map_fn,
                                    emergency_reference_map_fn,
                                    template_map_fn_factory)
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = '/weights/Shanghai_AI_Laboratory/internlm2-chat-7b'
use_varlen_attn = False

# Data
data_path = "/dataset/internlm_train/test"
prompt_template = PROMPT_TEMPLATE.internlm2_chat
max_length = 2048
pack_to_max_length = True

# Scheduler & Optimizer
batch_size = 1  # per_device
accumulative_counts = 16
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 10
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 100
# SYSTEM = SYSTEM_TEMPLATE.emergency
SYSTEM = ''

# Evaluation prompts (in Chinese): 'What is your name?', 'What is the boiling point of ethanol?'
evaluation_inputs = ['你叫什么名字?', '乙醇的沸点是多少?']

#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    # use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=False,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
all_file_list = glob(pathname=str(Path(data_path).joinpath("*.json")))

data_assitant = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=data_path),
    # dataset=dict(
    #     type=load_dataset,
    #     path='json',
    #     data_files=all_file_list),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=None,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

# train_dataset = dict(
#     type=ConcatDataset, datasets=[data_assitant])

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=data_assitant,
    sampler=dict(type=DefaultSampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=False,
        interval=save_steps,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)

LZHgrla commented 5 months ago

@rourouZ

Please set map_num_proc=1 for process_hf_dataset to disable multiprocessing during dataset preprocessing:

data_assitant = dict(
    type=process_hf_dataset,
+   map_num_proc=1,
    dataset=dict(type=load_dataset, path=data_path),
    # dataset=dict(
    #     type=load_dataset,
    #     path='json',
    #     data_files=all_file_list),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=None,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)
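For background: the ConnectionResetError in the traceback is raised inside the multiprocess worker pool that datasets spawns while mapping the dataset; with map_num_proc=1 the map runs entirely in the main process, so no worker pipes are involved. Below is a minimal standalone sketch of the same idea using the plain datasets API (not xtuner); it only assumes the JSON files live under the config's data_path.

from datasets import load_dataset

# Load the same JSON files the config points at.
ds = load_dataset('json', data_files='/dataset/internlm_train/test/*.json', split='train')

def touch(example):
    # Trivial map function, only here to exercise the preprocessing path.
    return example

# num_proc=1 keeps the map in the main process; a larger num_proc uses the
# multiprocess worker pool whose broken pipe produced the ConnectionResetError above.
ds = ds.map(touch, num_proc=1)
print(ds)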

rourouZ commented 5 months ago

map_num_proc=1,

It works now. Thanks a lot!