InternLM / xtuner

An efficient, flexible and full-featured toolkit for fine-tuning LLMs (InternLM2, Llama3, Phi3, Qwen, Mistral, ...)
https://xtuner.readthedocs.io/zh-cn/latest/
Apache License 2.0

Official script with its matching dataset reports a column-name mismatch; a custom dataset in the same format also errors #692

Open LumenScope opened 6 months ago

LumenScope commented 6 months ago

Script: qwen1_5_14b_chat_qlora_alpaca_e3. Dataset: tatsu-lab/alpaca.

Code used to download the dataset locally:

from datasets import load_dataset
import json

# Load the dataset
dataset = load_dataset("tatsu-lab/alpaca")

# Only the train split is processed here
data = dataset['train']

# Save the data as a JSONL file
with open('/home/tangshi/TangShi/Pku政务大模型/Trainer/Tools/xtuner/data/alpaca_train.jsonl', 'w', encoding='utf-8') as f:
    for item in data:
        json_line = json.dumps(item)  # serialize the dict to a JSON string
        f.write(json_line + '\n')  # write the line followed by a newline
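(One caveat: json.dumps escapes non-ASCII characters by default, so records containing Chinese text are written as \uXXXX sequences; passing ensure_ascii=False would keep the JSONL human-readable. Either form loads fine with datasets.)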

A sample line from the resulting JSONL:

{"instruction": "Give three tips for staying healthy.", "input": "", "output": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.", "text": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."}
...

Training was launched with: NPROC_PER_NODE=4 xtuner train /work/tzz/xtuner/config/qwen1_5_14b_chat_qlora_alpaca_e3_copy.py --deepspeed deepspeed_zero3

# List all built-in configs
# xtuner list-cfg
# Copy a config to a given path
# xtuner copy-cfg {cfg_name} /work/tzz/xtuner/config
# Train
# NPROC_PER_NODE=4 xtuner train /work/tzz/xtuner/config/qwen1_5_14b_chat_qlora_alpaca_e3_copy.py --deepspeed deepspeed_zero3
# Convert the checkpoint format
# xtuner convert pth_to_hf ${CONFIG_NAME_OR_PATH} ${PTH_file_dir} ${SAVE_PATH}
# Merge the LoRA adapter
# xtuner convert merge \
#     ${NAME_OR_PATH_TO_LLM} \
#     ${NAME_OR_PATH_TO_ADAPTER} \
#     ${SAVE_PATH}

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from datasets import load_dataset
from mmengine.dataset import DefaultSampler
from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook,
                            LoggerHook, ParamSchedulerHook)
from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR
from peft import LoraConfig
from torch.optim import AdamW
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig)

from xtuner.dataset import process_hf_dataset
from xtuner.dataset.collate_fns import default_collate_fn
from xtuner.dataset.map_fns import template_map_fn_factory
from xtuner.engine.hooks import (DatasetInfoHook, EvaluateChatHook,
                                 VarlenAttnArgsToMessageHubHook)
from xtuner.engine.runner import TrainLoop
from xtuner.model import SupervisedFinetune
from xtuner.parallel.sequence import SequenceParallelSampler
from xtuner.utils import PROMPT_TEMPLATE, SYSTEM_TEMPLATE

#######################################################################
#                          PART 1  Settings                           #
#######################################################################
# Model
pretrained_model_name_or_path = '/work/tzz/model/Qwen1.5-14B-Chat'
use_varlen_attn = False

# Data
alpaca_en_path = '/work/tzz/xtuner/data'
prompt_template = PROMPT_TEMPLATE.qwen_chat
max_length = 2048
pack_to_max_length = True

# parallel
sequence_parallel_size = 1

# Scheduler & Optimizer
batch_size = 1  # per_device
accumulative_counts = 16
accumulative_counts *= sequence_parallel_size
dataloader_num_workers = 0
max_epochs = 3
optim_type = AdamW
lr = 2e-4
betas = (0.9, 0.999)
weight_decay = 0
max_norm = 1  # grad clip
warmup_ratio = 0.03

# Save
save_steps = 500
save_total_limit = 2  # Maximum checkpoints to keep (-1 means unlimited)

# Evaluate the generation performance during the training
evaluation_freq = 500
SYSTEM = "你的任务是重庆市政务文书写作、政务问答。\n参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。\n"
evaluation_inputs = [
    '信件标题:平潭综合实验区政策咨询\n信件内容:如何申请成为重庆市政府的平潭综合实验区政策援助对象?范围和条件分别是什么?',
    '信件标题:询问步行街烟火管理政策\n信件内容:您好,我想开一家熟食摊位在我所在区的步行街,我想请问重庆市对于步行街烟火管理有没有特定的政策规定需要我们遵循?'
]

def SYSTEM_map_fn(example):
    return {
        'conversation': [{
            'system': f"{example['instruction']}",
            'input': f"{example['input']}",
            'output': example['output']
        }]
    }
#######################################################################
#                      PART 2  Model & Tokenizer                      #
#######################################################################
tokenizer = dict(
    type=AutoTokenizer.from_pretrained,
    pretrained_model_name_or_path=pretrained_model_name_or_path,
    trust_remote_code=True,
    padding_side='right')

model = dict(
    type=SupervisedFinetune,
    use_varlen_attn=use_varlen_attn,
    llm=dict(
        type=AutoModelForCausalLM.from_pretrained,
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        quantization_config=dict(
            type=BitsAndBytesConfig,
            load_in_4bit=True,
            load_in_8bit=False,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4')),
    lora=dict(
        type=LoraConfig,
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias='none',
        task_type='CAUSAL_LM'))

#######################################################################
#                      PART 3  Dataset & Dataloader                   #
#######################################################################
alpaca_en = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=alpaca_en_path),
    tokenizer=tokenizer,
    max_length=max_length,
    dataset_map_fn=SYSTEM_map_fn,
    template_map_fn=dict(
        type=template_map_fn_factory, template=prompt_template),
    remove_unused_columns=True,
    shuffle_before_pack=True,
    pack_to_max_length=pack_to_max_length,
    use_varlen_attn=use_varlen_attn)

sampler = SequenceParallelSampler \
    if sequence_parallel_size > 1 else DefaultSampler

train_dataloader = dict(
    batch_size=batch_size,
    num_workers=dataloader_num_workers,
    dataset=alpaca_en,
    sampler=dict(type=sampler, shuffle=True),
    collate_fn=dict(type=default_collate_fn, use_varlen_attn=use_varlen_attn))

#######################################################################
#                    PART 4  Scheduler & Optimizer                    #
#######################################################################
# optimizer
optim_wrapper = dict(
    type=AmpOptimWrapper,
    optimizer=dict(
        type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay),
    clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False),
    accumulative_counts=accumulative_counts,
    loss_scale='dynamic',
    dtype='float16')

# learning policy
# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md  # noqa: E501
param_scheduler = [
    dict(
        type=LinearLR,
        start_factor=1e-5,
        by_epoch=True,
        begin=0,
        end=warmup_ratio * max_epochs,
        convert_to_iter_based=True),
    dict(
        type=CosineAnnealingLR,
        eta_min=0.0,
        by_epoch=True,
        begin=warmup_ratio * max_epochs,
        end=max_epochs,
        convert_to_iter_based=True)
]

# train, val, test setting
train_cfg = dict(type=TrainLoop, max_epochs=max_epochs)

#######################################################################
#                           PART 5  Runtime                           #
#######################################################################
# Log the dialogue periodically during the training process, optional
custom_hooks = [
    dict(type=DatasetInfoHook, tokenizer=tokenizer),
    dict(
        type=EvaluateChatHook,
        tokenizer=tokenizer,
        every_n_iters=evaluation_freq,
        evaluation_inputs=evaluation_inputs,
        system=SYSTEM,
        prompt_template=prompt_template)
]

if use_varlen_attn:
    custom_hooks += [dict(type=VarlenAttnArgsToMessageHubHook)]

# configure default hooks
default_hooks = dict(
    # record the time of every iteration.
    timer=dict(type=IterTimerHook),
    # print log every 10 iterations.
    logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10),
    # enable the parameter scheduler.
    param_scheduler=dict(type=ParamSchedulerHook),
    # save checkpoint per `save_steps`.
    checkpoint=dict(
        type=CheckpointHook,
        by_epoch=False,
        interval=save_steps,
        max_keep_ckpts=save_total_limit),
    # set sampler seed in distributed environment.
    sampler_seed=dict(type=DistSamplerSeedHook),
)

# configure environment
env_cfg = dict(
    # whether to enable cudnn benchmark
    cudnn_benchmark=False,
    # set multi process parameters
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    # set distributed parameters
    dist_cfg=dict(backend='nccl'),
)

# set visualizer
visualizer = None

# set log level
log_level = 'INFO'

# load from which checkpoint
load_from = None

# whether to resume training from the loaded checkpoint
resume = False

# Defaults to use random seed and disable `deterministic`
randomness = dict(seed=None, deterministic=False)

# set log processor
log_processor = dict(by_epoch=False)

Error:

05/15 14:21:55 - mmengine - INFO - xtuner_dataset_timeout = 1:00:00
Generating train split: 52002 examples [00:00, 100551.32 examples/s]
[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 2011, in _prepare_split_single
[rank0]:     writer.write_table(table)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/arrow_writer.py", line 585, in write_table
[rank0]:     pa_table = table_cast(pa_table, self._schema)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/table.py", line 2295, in table_cast
[rank0]:     return cast_table_to_schema(table, schema)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/table.py", line 2249, in cast_table_to_schema
[rank0]:     raise CastError(
[rank0]: datasets.table.CastError: Couldn't cast
[rank0]: id: int64
[rank0]: createUser: string
[rank0]: eval: int64
[rank0]: mainDeptName: string
[rank0]: replyContent: string
[rank0]: eval-title: list<item: null>
[rank0]:   child 0, item: null
[rank0]: replyDeptName: string
[rank0]: title: string
[rank0]: content: string
[rank0]: publicTime: string
[rank0]: submitTime: string
[rank0]: replyTime: string
[rank0]: eval-key: list<item: null>
[rank0]:   child 0, item: null
[rank0]: eval-senti: list<item: null>
[rank0]:   child 0, item: null
[rank0]: to
[rank0]: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
[rank0]: because column names don't match

[rank0]: During handling of the above exception, another exception occurred:

[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
[rank0]:     main()
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
[rank0]:     runner.train()
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1160, in train
[rank0]:     self._train_loop = self.build_train_loop(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 958, in build_train_loop
[rank0]:     loop = LOOPS.build(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank0]:     return self.build_func(cfg, *args, **kwargs, registry=self)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank0]:     obj = obj_cls(**args)  # type: ignore
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
[rank0]:     dataloader = runner.build_dataloader(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 824, in build_dataloader
[rank0]:     dataset = DATASETS.build(dataset_cfg)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank0]:     return self.build_func(cfg, *args, **kwargs, registry=self)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank0]:     obj = obj_cls(**args)  # type: ignore
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 308, in process_hf_dataset
[rank0]:     dataset = process(**kwargs)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 167, in process
[rank0]:     dataset = build_origin_dataset(dataset, split)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 30, in build_origin_dataset
[rank0]:     dataset = BUILDER.build(dataset)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank0]:     return self.build_func(cfg, *args, **kwargs, registry=self)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank0]:     obj = obj_cls(**args)  # type: ignore
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/load.py", line 2609, in load_dataset
[rank0]:     builder_instance.download_and_prepare(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 1027, in download_and_prepare
[rank0]:     self._download_and_prepare(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 1122, in _download_and_prepare
[rank0]:     self._prepare_split(split_generator, **prepare_split_kwargs)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 1882, in _prepare_split
[rank0]:     for job_id, done, content in self._prepare_split_single(
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 2013, in _prepare_split_single
[rank0]:     raise DatasetGenerationCastError.from_cast_error(
[rank0]: datasets.exceptions.DatasetGenerationCastError: An error occurred while generating the dataset

[rank0]: All the data files must have the same columns, but at some point there are 14 new columns (id, createUser, eval, mainDeptName, replyContent, eval-title, replyDeptName, title, content, publicTime, submitTime, replyTime, eval-key, eval-senti) and 4 missing columns (input, output, text, instruction).

[rank0]: This happened while the json dataset builder was generating data using

[rank0]: /work/tzz/xtuner/data/org/qa_train.json

[rank0]: Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)
[rank1]:[E ProcessGroupGloo.cpp:144] Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank2]:[E ProcessGroupGloo.cpp:144] Rank 2 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank3]:[E ProcessGroupGloo.cpp:144] Rank 3 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank1]: Traceback (most recent call last):
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 360, in <module>
[rank1]:     main()
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py", line 356, in main
[rank1]:     runner.train()
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 1160, in train
[rank1]:     self._train_loop = self.build_train_loop(
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 958, in build_train_loop
[rank1]:     loop = LOOPS.build(
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank1]:     return self.build_func(cfg, *args, **kwargs, registry=self)
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank1]:     obj = obj_cls(**args)  # type: ignore
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/engine/runner/loops.py", line 32, in __init__
[rank1]:     dataloader = runner.build_dataloader(
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/runner/_flexible_runner.py", line 824, in build_dataloader
[rank1]:     dataset = DATASETS.build(dataset_cfg)
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
[rank1]:     return self.build_func(cfg, *args, **kwargs, registry=self)
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
[rank1]:     obj = obj_cls(**args)  # type: ignore
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/dataset/huggingface.py", line 313, in process_hf_dataset
[rank1]:     dist.monitored_barrier(group=group_gloo, timeout=xtuner_dataset_timeout)
[rank1]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 3763, in monitored_barrier
[rank1]:     return group_to_use.monitored_barrier(timeout, wait_all_ranks=wait_all_ranks)
[rank1]: RuntimeError: Rank 1 successfully reached monitoredBarrier, but received errors while waiting for send/recv from rank 0. Please check rank 0 logs for faulty rank.
[rank1]:  Original exception: 
[rank1]: [../third_party/gloo/gloo/transport/tcp/pair.cc:534] Connection closed by peer [172.17.0.2]:22693
[rank2]/[rank3]: the same monitoredBarrier tracebacks and RuntimeErrors as rank 1 (connection closed by peer [172.17.0.2]:22693).
W0515 14:21:58.141000 139822787155776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 30877 closing signal SIGTERM
W0515 14:21:58.141000 139822787155776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 30878 closing signal SIGTERM
W0515 14:21:58.141000 139822787155776 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 30879 closing signal SIGTERM
E0515 14:21:58.489000 139822787155776 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 0 (pid: 30876) of binary: /root/miniconda3/envs/xtuner/bin/python
Traceback (most recent call last):
  File "/root/miniconda3/envs/xtuner/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/run.py", line 879, in main
    run(args)
  File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/xtuner/tools/train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-05-15_14:21:58
  host      : a19780ffc442
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 30876)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
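One detail worth noting in this trace: the config sets alpaca_en_path to the directory /work/tzz/xtuner/data, while the failing file is /work/tzz/xtuner/data/org/qa_train.json. load_dataset(path=<directory>) sweeps every JSON file under that directory into a single dataset, so files with different schemas collide. A minimal sketch (reusing the paths above, purely illustrative) of pinning the loader to a single file via the Hugging Face 'json' builder:

from datasets import load_dataset

# Illustrative: point the 'json' builder at one explicit file so other JSON
# files under the directory (e.g. org/qa_train.json) are not swept in.
dataset = load_dataset(
    'json',
    data_files='/work/tzz/xtuner/data/alpaca_train.jsonl',
    split='train')
print(dataset.column_names)  # expected: ['instruction', 'input', 'output', 'text']

The same dict-style dataset entry in the config would then read dataset=dict(type=load_dataset, path='json', data_files='/work/tzz/xtuner/data/alpaca_train.jsonl').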

Switching to a custom dataset (one JSONL line shown below):

{"instruction": "你的任务是重庆市政务文书写作、政务问答 \n 你生成的问题必须包含:1、留言标题,2、留言摘要。你生成的答复内容部分必须有法律依据,且表明已审查,参照你固有的知识或者我给出的法律文献,在引用法律文件时使用《》包裹其名称。仅一种情况除外:当问题类型为需人类实际行动并给出反馈时,不需要强行引用法律。", "input": "拆迁方面咨询\n本人老家南川区木凉镇,最近镇政府给我们做了一个动员大会,说要征用我们的房屋及土地(木凉街道后面那一片),用于修建别墅区。还给我们发了一个关于拆迁的宣传手册。其中关于房屋的赔付标准,我们发现该标准和08年区府出的《南川府发 〔2008〕 94号》中关于房屋的拆迁货币安置房屋补偿标准相差无几。 请问在这个房价一天一涨,物价也越来越高的时候,还以08年那个年代的标准来赔付,适用吗?拆迁本着以不加重我们农民的负担的情况下来进行的吧。", "output": "同志:\r\n    你好!来信收悉,现回复如下:\r\n  1、“木凉镇漱玉湖周边地块土地整治项目”房屋搬迁经南川府办函[2018]13号批准,实施房屋搬迁,搬迁人区城投公司,实施单位木凉镇人民政府。\r\n  2、“木凉镇漱玉湖周边地块土地整治项目”房屋搬迁参照执行政策为《重庆市南川区集体土地上房屋拆迁补偿安置实施办法》(南川府办发〔2016〕125号),并未按南川府发〔2008〕94号标准执行,南川府办发〔2016〕125号第三十四条明确规定“本办法自2017年1月1日起施行。原《重庆市南川区人民政府关于调整征地拆迁房屋及其他地上构(附)着物补偿安置政策有关事项的通知》(南川府发〔2008〕63号)和《重庆市南川区人民政府关于调整征地拆迁房屋及其他地上构(附)着物补偿安置政策有关事项的通知》(南川府发〔2008〕94号)同时废止。”  \r\n  3、房屋补偿指对房屋重置价格补偿,南川府发〔2008〕94号只对房屋进行补偿;南川府办发〔2016〕125号在安置方式进行调整,按照“人房分离、按房补偿、以人安置”的原则在对房屋进行补偿后,还要对人员进行住房安置。\r\n    感谢你对我区征地补偿安置的支持,若您仍有疑问,可向区土地和房屋征收中心咨询。"}

Since I observed that the map fn never uses the text field, I did not include it:

def alpaca_map_fn(example):
    if example.get('output') == '<nooutput>':
        return {'conversation': []}
    else:
        return {
            'conversation': [{
                'input': f"{example['instruction']}\n{example['input']}",
                'output': example['output']
            }]
        }
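For reference, applied to the first alpaca record shown earlier, this map fn would produce roughly:

{'conversation': [{'input': 'Give three tips for staying healthy.\n', 'output': '1.Eat a balanced diet and ...'}]}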

Error:

05/15 16:24:34 - mmengine - INFO - xtuner_dataset_timeout = 1:00:00
Generating train split: 52002 examples [00:00, 91212.06 examples/s]
[rank0]: Traceback (most recent call last):
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/builder.py", line 2011, in _prepare_split_single
[rank0]:     writer.write_table(table)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/arrow_writer.py", line 585, in write_table
[rank0]:     pa_table = table_cast(pa_table, self._schema)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/table.py", line 2295, in table_cast
[rank0]:     return cast_table_to_schema(table, schema)
[rank0]:   File "/root/miniconda3/envs/xtuner/lib/python3.10/site-packages/datasets/table.py", line 2249, in cast_table_to_schema
[rank0]:     raise CastError(
[rank0]: datasets.table.CastError: Couldn't cast
[rank0]: eval-senti: list<item: null>
[rank0]:   child 0, item: null
[rank0]: eval-key: list<item: null>
[rank0]:   child 0, item: null
[rank0]: id: int64
[rank0]: publicTime: string
[rank0]: eval-title: list<item: null>
[rank0]:   child 0, item: null
[rank0]: submitTime: string
[rank0]: replyDeptName: string
[rank0]: replyTime: string
[rank0]: createUser: string
[rank0]: title: string
[rank0]: mainDeptName: string
[rank0]: replyContent: string
[rank0]: content: string
[rank0]: eval: int64
[rank0]: to
[rank0]: {'instruction': Value(dtype='string', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None)}
[rank0]: because column names don't match

[rank0]: During handling of the above exception, the same DatasetGenerationCastError traceback as in the first run follows, ending with:
[rank0]: datasets.exceptions.DatasetGenerationCastError: An error occurred while generating the dataset

[rank0]: All the data files must have the same columns, but at some point there are 14 new columns (eval-senti, eval-key, id, publicTime, eval-title, submitTime, replyDeptName, replyTime, createUser, title, mainDeptName, replyContent, content, eval) and 4 missing columns (text, instruction, output, input).

[rank0]: This happened while the json dataset builder was generating data using

[rank0]: /work/tzz/xtuner/data/org/qa_train.json

[rank0]: Please either edit the data files to have matching columns, or separate them into different configurations (see docs at https://hf.co/docs/hub/datasets-manual-configuration#multiple-configurations)
[rank1]/[rank2]/[rank3]: the same monitoredBarrier RuntimeError tracebacks as in the first run, followed by the same torchrun ChildFailedError summary (root cause: rank 0, exitcode 1).

It complains that the column names don't match, yet the map fn never uses the text field.

The original default dataset errors as well.

LumenScope commented 6 months ago

Update: I changed the map fn to

def SYSTEM_map_fn(example):
    return {
        'conversation': [{
            'system': f"{example['instruction']}",
            'input': f"{example['input']}",
            'output': example['output']
        }]
    }

but the custom dataset still fails:

05/15 16:27:56 - mmengine - INFO - xtuner_dataset_timeout = 1:00:00
Generating train split: 52002 examples [00:00, 96581.19 examples/s] 
[rank0]: the same CastError (the 14 custom columns vs. the expected instruction/input/output/text schema) and DatasetGenerationCastError traceback as above, again raised while the json dataset builder was generating data from /work/tzz/xtuner/data/org/qa_train.json.
[rank1]/[rank2]/[rank3]: the same monitoredBarrier RuntimeErrors, followed by the same torchrun ChildFailedError summary (root cause: rank 0, exitcode 1).
pppppM commented 5 months ago

@LumenScope First, it's the way the map fn is defined: an mmengine config cannot define new functions inside the config file itself; they can only be brought in via import. See

https://github.com/InternLM/xtuner/tree/main/examples/demo_data/multi_turn_2#config
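A minimal sketch of that pattern (module name hypothetical; see the linked example for xtuner's exact layout): keep the map fn in a separate importable file and import it in the config instead of defining it there.

# my_map_fn.py -- placed somewhere importable from the training workdir
def SYSTEM_map_fn(example):
    # convert one raw record into xtuner's conversation format
    return {
        'conversation': [{
            'system': example['instruction'],
            'input': example['input'],
            'output': example['output']
        }]
    }

# in the config, replace the inline definition with an import:
# from my_map_fn import SYSTEM_map_fn
# ...
# dataset_map_fn=SYSTEM_map_fn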

Second, for a custom dataset, you can run xtuner check-custom-dataset $CONFIG to check where the format is wrong.

Finally, you can run xtuner log-dataset $CONFIG to see what the converted data looks like.