ymcui / Chinese-LLaMA-Alpaca

Chinese LLaMA & Alpaca large language models + local CPU/GPU training and deployment (Chinese LLaMA & Alpaca LLMs)
https://github.com/ymcui/Chinese-LLaMA-Alpaca/wiki
Apache License 2.0

Error during instruction fine-tuning #887

Closed · dyqc closed 3 months ago

dyqc commented 4 months ago

The following items must be checked before submitting

Issue type

Model training and fine-tuning

Base model

Alpaca-7B

Operating system

Linux

Detailed description of the problem

# Read the wiki (https://github.com/ymcui/Chinese-LLaMA-Alpaca-2/wiki/sft_scripts_zh) carefully before running the script
lr=1e-4
# lora_rank=64
# lora_alpha=128
lora_rank=8
lora_alpha=32
#lora_trainable="q_proj,v_proj,k_proj,o_proj,gate_proj,down_proj,up_proj"
lora_trainable="q_proj,v_proj"
# modules_to_save="embed_tokens,lm_head"
lora_dropout=0.05

pretrained_model=/usr/aplaca/chinese-alpaca-2-7b-hf
chinese_tokenizer_path=/usr/aplaca/chinese-alpaca-2-7b-hf/tokenizer.model
dataset_dir=/usr/aplaca/data
per_device_train_batch_size=1
per_device_eval_batch_size=1
gradient_accumulation_steps=8
max_seq_length=512
output_dir=/usr/aplaca/output
validation_file=/usr/aplaca/eval/data.json

deepspeed_config_file=ds_zero2_no_offload.json

torchrun --nnodes 1 --nproc_per_node 1 run_clm_sft_with_peft.py \
    --deepspeed ${deepspeed_config_file} \
    --model_name_or_path ${pretrained_model} \
    --tokenizer_name_or_path ${chinese_tokenizer_path} \
    --dataset_dir ${dataset_dir} \
    --per_device_train_batch_size ${per_device_train_batch_size} \
    --per_device_eval_batch_size ${per_device_eval_batch_size} \
    --do_train \
    --do_eval \
    --seed $RANDOM \
    --fp16 \
    --num_train_epochs 1 \
    --lr_scheduler_type cosine \
    --learning_rate ${lr} \
    --warmup_ratio 0.03 \
    --weight_decay 0 \
    --logging_strategy steps \
    --logging_steps 10 \
    --save_strategy steps \
    --save_total_limit 3 \
    --evaluation_strategy steps \
    --eval_steps 100 \
    --save_steps 200 \
    --gradient_accumulation_steps ${gradient_accumulation_steps} \
    --preprocessing_num_workers 8 \
    --max_seq_length ${max_seq_length} \
    --output_dir ${output_dir} \
    --overwrite_output_dir \
    --ddp_timeout 30000 \
    --logging_first_step True \
    # --lora_rank ${lora_rank} \
    # --lora_alpha ${lora_alpha} \
    # --trainable ${lora_trainable} \
    # --lora_dropout ${lora_dropout} \
    # --modules_to_save ${modules_to_save} \
    --torch_dtype float16 \
    --validation_file ${validation_file} \
    --load_in_kbits 16 \
    --save_safetensors False \
    # --gradient_checkpointing \
    --ddp_find_unused_parameters False
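
Note: the commented-out arguments in the middle of the torchrun command are the likely cause of the failure below. In bash, a word starting with # on a backslash-continued line turns the rest of the joined line into a comment, and a trailing backslash inside a comment does not continue the command, so every argument after --logging_first_step True (including --validation_file) is silently dropped from the torchrun invocation. A minimal standalone demonstration with hypothetical values:

# The "#" swallows the remainder of the joined line, and its trailing "\"
# does not resume the continuation.
echo --logging_first_step True \
    # --lora_rank 8 \
    --torch_dtype float16
# Prints only "--logging_first_step True"; bash then tries to execute
# "--torch_dtype float16" as a separate command and fails with
# "--torch_dtype: command not found".

To disable an option, delete its line or move it into a comment above the command; inside a backslash-continued command, every line except the last must end with \ and none may contain a comment.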

Dependencies (required for code-related issues)

peft==0.3.0
torch==2.0.1
transformers==4.35.0
sentencepiece==0.1.99
bitsandbytes==0.41.1

Run logs or screenshots

root@7cfc54b5443d:/usr/aplaca/training# bash run_sft.sh
[2024-03-19 13:38:59,215] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-03-19 13:39:02,745] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-03-19 13:39:02,746] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
03/19/2024 13:41:12 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True
[INFO|configuration_utils.py:715] 2024-03-19 13:41:12,156 >> loading configuration file /usr/aplaca/chinese-alpaca-2-7b-hf/config.json
[INFO|configuration_utils.py:777] 2024-03-19 13:41:12,159 >> Model config LlamaConfig {
  "_name_or_path": "/usr/aplaca/chinese-alpaca-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.0",
  "use_cache": true,
  "vocab_size": 55296
}

/usr/local/lib/python3.8/dist-packages/transformers/tokenization_utils_base.py:1925: FutureWarning: Calling LlamaTokenizer.from_pretrained() with the path to a single file or url is deprecated and won't be possible anymore in v5. Use a model identifier or the path to a directory instead.
  warnings.warn(
[INFO|tokenization_utils_base.py:2022] 2024-03-19 13:41:12,161 >> loading file /usr/aplaca/chinese-alpaca-2-7b-hf/tokenizer.model from cache at /usr/aplaca/chinese-alpaca-2-7b-hf/tokenizer.model
[WARNING|logging.py:329] 2024-03-19 13:41:12,163 >> You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
03/19/2024 13:41:12 - INFO - __main__ - Training files: /usr/aplaca/data/data.json
03/19/2024 13:41:12 - WARNING - root - building dataset...
03/19/2024 13:41:12 - INFO - __name__ - training datasets-/usr/aplaca/data/data.json has been loaded from disk
03/19/2024 13:41:12 - INFO - __main__ - Num train_samples  50
03/19/2024 13:41:12 - INFO - __main__ - Training example:
03/19/2024 13:41:12 - INFO - __main__ - <s> [INST] <<SYS>>
You are a helpful assistant. 你是一个乐于助人的助手。
<</SYS>>

请提取文本中的关键敏感信息,以json格式返回
策划:武汉大学采购与招投标管理中心风景摄影:许志强技术支持:北京阳光公采科技有限公司建议使用chrome、firefox等浏览器电话:400-666-0839工作日:9:00-17:30服务邮箱:service_edu@bosssoft.com.cn [/INST] [{"策划":"武汉大学采购与招投标管理中心","电话":"400-666-0839","邮箱":"service_edu@bossoft.com,cn"}]</s>
Traceback (most recent call last):
  File "run_clm_sft_with_peft.py", line 513, in <module>
    main()
  File "run_clm_sft_with_peft.py", line 365, in main
    logger.info(f"Evaluation files: {' '.join(files)}")
TypeError: sequence item 0: expected str instance, NoneType found
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 4314) of binary: /usr/bin/python3
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.8/dist-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
run_clm_sft_with_peft.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2024-03-19_13:41:13
  host      : 7cfc54b5443d
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 4314)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
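
The traceback is consistent with the truncated command line above: --do_eval is passed, but --validation_file never reaches run_clm_sft_with_peft.py, so the evaluation file list contains None and ' '.join(files) raises TypeError: sequence item 0: expected str instance, NoneType found. A sketch of a fixed tail for the torchrun command (replacing everything from --logging_first_step True onward; the unused LoRA flags can live as ordinary comment lines above the command instead):

    --logging_first_step True \
    --torch_dtype float16 \
    --validation_file ${validation_file} \
    --load_in_kbits 16 \
    --save_safetensors False \
    --ddp_find_unused_parameters False
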
github-actions[bot] commented 3 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your consideration.

github-actions[bot] commented 3 months ago

Closing the issue, since no updates observed. Feel free to re-open if you need any further assistance.