InternLM / xtuner

An efficient, flexible and full-featured toolkit for fine-tuning LLM (InternLM2, Llama3, Phi3, Qwen, Mistral, ...)
https://xtuner.readthedocs.io/zh-cn/latest/
Apache License 2.0
3.64k stars 297 forks source link

process_untokenized_datasets使用自定义map_fn时无法正确导出新config #810

Open SingL3 opened 1 month ago

SingL3 commented 1 month ago

在 config 文件中使用以下方法导入自定义 map_fn:

with read_base():
    from .my_map_fn import my_map_fn

运行 process_untokenized_datasets 时,数据集本身可以成功导出,但随后在导出修改后的新 config(modified_cfg.dump)时报错:

Processed dataset has been saved to /mnt/home/xxxxxxx/llm/xtuner/examples/test
Traceback (most recent call last):
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf/pytree/pytree_utils.py", line 113, in ParseCodeToTree
    tree = parser_driver.parse_string(code, debug=False)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf_third_party/_ylib2to3/pgen2/driver.py", line 188, in parse_string
    return self.parse_tokens(tokens, debug)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf_third_party/_ylib2to3/pgen2/driver.py", line 157, in parse_tokens
    if p.addtoken(type, value, (prefix, start)):
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf_third_party/_ylib2to3/pgen2/parse.py", line 230, in addtoken
    return self._addtoken(ilabel, type, value, context)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf_third_party/_ylib2to3/pgen2/parse.py", line 313, in _addtoken
    raise ParseError('bad input', type, value, context)
yapf_third_party._ylib2to3.pgen2.parse.ParseError: bad input: type=20, value='<', context=('', (91, 14))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf/yapflib/yapf_api.py", line 198, in FormatCode
    tree = pytree_utils.ParseCodeToTree(unformatted_source)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf/pytree/pytree_utils.py", line 116, in ParseCodeToTree
    ast.parse(code)
  File "/mnt/data/conda/envs/llm/lib/python3.10/ast.py", line 50, in parse
    return compile(source, filename, mode, flags,
  File "<unknown>", line 91
    my_map_fn=<function my_map_fn at 0x7f3f46508ca0>
                  ^
SyntaxError: invalid syntax

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/mmengine/config/config.py", line 1483, in pretty_text
    text, _ = FormatCode(text, style_config=yapf_style)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/yapf/yapflib/yapf_api.py", line 201, in FormatCode
    raise errors.YapfError(errors.FormatErrorMsg(e))
yapf.yapflib.errors.YapfError: <unknown>:91:15: invalid syntax

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/mnt/home/xxxxxxx/llm/xtuner/xtuner/tools/process_untokenized_datasets.py", line 73, in <module>
    modified_cfg.dump(modified_cfg_save_path)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/mmengine/config/config.py", line 1575, in dump
    f.write(self.pretty_text)
  File "/mnt/data/conda/envs/llm/lib/python3.10/site-packages/mmengine/config/config.py", line 1488, in pretty_text
    raise SyntaxError('Failed to format the config file, please '
SyntaxError: Failed to format the config file, please check the syntax of:
accumulative_counts=4
batch_size=16
betas=(
    0.9,
    0.999,
    )
custom_hooks=[
    dict(tokenizer=dict(
            padding_side='right',
            pretrained_model_name_or_path='/mnt/data/xxxxxxx/llm/pretrained_model/internlm2_5-7b',
            trust_remote_code=True,
            type='transformers.AutoTokenizer.from_pretrained'),
        type='xtuner.engine.hooks.DatasetInfoHook'),
    ]
dataloader_num_workers=2
default_hooks=dict(
    checkpoint=dict(
        by_epoch=False,
        interval=1000,
        max_keep_ckpts=2,
        type='mmengine.hooks.CheckpointHook'),
    logger=dict(
        interval=10,
        log_metric_by_epoch=False,
        type='mmengine.hooks.LoggerHook'),
    param_scheduler=dict(
        type='mmengine.hooks.ParamSchedulerHook'),
    sampler_seed=dict(
        type='mmengine.hooks.DistSamplerSeedHook'),
    timer=dict(
        type='mmengine.hooks.IterTimerHook'))
env_cfg=dict(
    cudnn_benchmark=False,
    dist_cfg=dict(
        backend='nccl'),
    mp_cfg=dict(
        mp_start_method='fork',
        opencv_num_threads=0))
load_from=None
log_level='INFO'
log_processor=dict(
    by_epoch=False)
lr=1e-05
max_epochs=4
max_length=512
max_norm=1
model=dict(
    llm=dict(
        pretrained_model_name_or_path='/mnt/data/xxxxxxx/llm/pretrained_model/internlm2_5-7b',
        trust_remote_code=True,
        type='transformers.AutoModelForCausalLM.from_pretrained'),
    type='xtuner.model.SupervisedFinetune',
    use_varlen_attn=False)
optim_type='torch.optim.AdamW'
optim_wrapper=dict(
    accumulative_counts=4,
    clip_grad=dict(
        error_if_nonfinite=False,
        max_norm=1),
    dtype='bfloat16',
    loss_scale='dynamic',
    optimizer=dict(
        betas=(
            0.9,
            0.999,
            ),
        lr=1e-05,
        type='torch.optim.AdamW',
        weight_decay=0),
    type='mmengine.optim.AmpOptimWrapper')
pack_to_max_length=False
param_scheduler=[
    dict(begin=0,
        by_epoch=True,
        convert_to_iter_based=True,
        end=0.12,
        start_factor=0.025,
        type='mmengine.optim.LinearLR'),
    dict(begin=0.12,
        by_epoch=True,
        convert_to_iter_based=True,
        end=4,
        eta_min=1.0000000000000002e-06,
        type='mmengine.optim.CosineAnnealingLR'),
    ]
pretrained_model_name_or_path='/mnt/data/xxxxxxx/llm/pretrained_model/internlm2_5-7b'
prompt_template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat'
randomness=dict(
    deterministic=False,
    seed=None)
my_map_fn=<function my_map_fn at 0x7f3f46508ca0>
resume=False
sampler='mmengine.dataset.DefaultSampler'
save_steps=1000
save_total_limit=2
sequence_parallel_size=1
tokenizer=dict(
    padding_side='right',
    pretrained_model_name_or_path='/mnt/data/xxxxxxx/llm/pretrained_model/internlm2_5-7b',
    trust_remote_code=True,
    type='transformers.AutoTokenizer.from_pretrained')
train_cfg=dict(
    max_epochs=4,
    type='xtuner.engine.runner.TrainLoop')
train_data_files=[
    '/mnt/home/xxxxxxx/llm/xtuner/xtuner/configs/llm/internlm2_5_7b/example_data.jsonl',
    ]
train_dataloader=dict(
    batch_size=16,
    collate_fn=dict(
        type='xtuner.dataset.collate_fns.default_collate_fn',
        use_varlen_attn=False),
    dataset=dict(
        dataset=dict(
            dataset_path='/mnt/home/xxxxxxx/llm/xtuner/examples/test',
            type='datasets.load_from_disk'),
        dataset_map_fn=None,
        do_dataset_tokenization=False,
        input_ids_with_output=False,
        max_dataset_length=None,
        max_length=None,
        pack_to_max_length=False,
        remove_unused_columns=False,
        rename_maps=[
            ],
        split=None,
        template_map_fn=None,
        tokenizer=None,
        type='xtuner.dataset.process_hf_dataset'),
    num_workers=2,
    sampler=dict(
        seed=1024,
        shuffle=True,
        type='xtuner.dataset.samplers.InternRepoSampler'))
train_dataset=dict(
    dataset=dict(
        data_files=[
            '/mnt/home/xxxxxxx/llm/xtuner/xtuner/configs/llm/internlm2_5_7b/example_data.jsonl',
            ],
        path='json',
        type='datasets.load_dataset'),
    dataset_map_fn=<function my_map_fn at 0x7f3f46508ca0>,
    max_length=512,
    pack_to_max_length=False,
    remove_unused_columns=True,
    shuffle_before_pack=True,
    template_map_fn=dict(
        template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat',
        type='xtuner.dataset.map_fns.template_map_fn_factory'),
    tokenizer=dict(
        padding_side='right',
        pretrained_model_name_or_path='/mnt/data/xxxxxxx/llm/pretrained_model/internlm2_5-7b',
        trust_remote_code=True,
        type='transformers.AutoTokenizer.from_pretrained'),
    type='xtuner.dataset.process_hf_dataset',
    use_varlen_attn=False)
use_varlen_attn=False
visualizer=dict(
    type='mmengine.visualization.Visualizer',
    vis_backends=[
        dict(type='mmengine.visualization.TensorboardVisBackend'),
        ])
warmup_ratio=0.03
weight_decay=0
fanqiNO1 commented 1 month ago

把 from .my_map_fn import my_map_fn 从 with read_base(): 块中移出试试看?