@xiaobaixue Hello! Is the dataset here, DATA="/root/MiniCPM-V/exam/mllm_demo.json" EVAL_DATA="/root/MiniCPM-V/exam/mllm_demo.json", one you made yourself or an official one? As far as I can see, the official docs only provide the coco-en-mini dataset for swift fine-tuning, but that dataset doesn't download as JSON, and its format doesn't seem to match the JSON format that finetune requires.
My own dataset, created by following the official training tutorial: https://modelbest.feishu.cn/wiki/HvfLwYzlIihqzXkmeCdczs6onmd
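For anyone with the same format question: the finetune script expects a JSON list of conversation records, not the raw coco-en-mini download. Below is a minimal Python sketch of one such record, following the demo layout in the tutorial linked above; the image path and texts are placeholders, not real data.

import json

# One record in the finetune JSON format: an id, an image path, and a
# user/assistant conversation where "<image>" marks the image position.
# The path and texts below are illustrative placeholders.
record = {
    "id": "0",
    "image": "/root/MiniCPM-V/exam/images/0001.jpg",
    "conversations": [
        {"role": "user", "content": "<image>\nDescribe this picture."},
        {"role": "assistant", "content": "A short reference answer."},
    ],
}

# finetune.py takes a list of such records via --data_path.
with open("mllm_demo.json", "w", encoding="utf-8") as f:
    json.dump([record], f, ensure_ascii=False, indent=2)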
Solved now: two 4090 cards would not work, but switching to two 3090 cards fixed it. Baffling.
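A possible explanation for the 4090-vs-3090 difference: consumer RTX 4090s do not support peer-to-peer (P2P) memory access, which NCCL may try to use between two GPUs, while cards where P2P works (or where it is cleanly disabled) run fine. A quick way to check on a given machine, using the standard torch.cuda API and assuming the two GPUs are devices 0 and 1:

import torch

# True means GPU 0 can directly read GPU 1's memory (P2P available).
# If this is False, NCCL_P2P_DISABLE=1 (already exported in the script
# below) is the usual workaround for multi-GPU hangs on such cards.
if torch.cuda.device_count() >= 2:
    print(torch.cuda.can_device_access_peer(0, 1))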
Training script:
#!/bin/bash

GPUS_PER_NODE=2
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001

MODEL="/root/MiniCPM-V/pretrained_weights/MiniCPM-V-2_6" # or openbmb/MiniCPM-V-2, openbmb/MiniCPM-Llama3-V-2_5
DATA="/root/MiniCPM-V/exam/mllm_demo.json"
EVAL_DATA="/root/MiniCPM-V/exam/mllm_demo.json"
LLM_TYPE="qwen2"

export NCCL_P2P_DISABLE=1
export NCCL_IB_DISABLE=1

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
    --model_name_or_path $MODEL \
    --llm_type $LLM_TYPE \
    --data_path $DATA \
    --eval_data_path $EVAL_DATA \
    --remove_unused_columns false \
    --label_names "labels" \
    --prediction_loss_only false \
    --bf16 false \
    --bf16_full_eval false \
    --fp16 true \
    --fp16_full_eval true \
    --do_train \
    --do_eval \
    --tune_vision true \
    --tune_llm false \
    --use_lora true \
    --lora_target_modules "llm..*layers.\d+.self_attn.(q_proj|k_proj|v_proj|o_proj)" \
    --model_max_length 2048 \
    --max_slice_nums 9 \
    --max_steps 10 \
    --eval_steps 1000 \
    --output_dir output/output__lora \
    --logging_dir output/output_lora \
    --logging_strategy "steps" \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 200 \
    --evaluation_strategy "steps" \
    --save_strategy "steps" \
    --save_steps 1000 \
    --save_total_limit 10 \
    --learning_rate 1e-6 \
    --weight_decay 0.1 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.01 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --gradient_checkpointing true \
    --deepspeed ds_config_zero3_offload.json \
    --report_to "tensorboard" # wandb
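One line in the script worth sanity-checking is the --lora_target_modules regex, since a pattern that matches nothing silently trains no LoRA weights. A small standalone sketch of how the pattern selects the attention projections by module name (the module names below are illustrative; PEFT-style wrappers match a string pattern against each full module name):

import re

# Pattern copied from --lora_target_modules above. The dots are unescaped,
# but "." matches any character in a regex, so the intended names still match.
pattern = r"llm..*layers.\d+.self_attn.(q_proj|k_proj|v_proj|o_proj)"

candidates = [
    "llm.model.layers.0.self_attn.q_proj",   # expected: match
    "llm.model.layers.11.self_attn.o_proj",  # expected: match
    "llm.model.layers.0.mlp.gate_proj",      # expected: no match
]
for name in candidates:
    print(name, "->", bool(re.fullmatch(pattern, name)))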
Error output:
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [11:04<00:00, 33.20s/it]
[rank0]:[E ProcessGroupNCCL.cpp:563] [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=52989, OpType=_ALLGATHER_BASE, NumelIn=181190315, NumelOut=543570945, Timeout(ms)=1800000) ran for 1800056 milliseconds before timing out.
[rank0]:[E ProcessGroupNCCL.cpp:1537] [PG 0 Rank 0] Timeout at NCCL work: 52989, last enqueued NCCL work: 52989, last completed NCCL work: 52988.
[rank0]:[E ProcessGroupNCCL.cpp:577] [Rank 0] Some NCCL operations have failed or timed out. Due to the asynchronous nature of CUDA kernels, subsequent GPU operations might run on corrupted/incomplete data.
[rank0]:[E ProcessGroupNCCL.cpp:583] [Rank 0] To avoid data inconsistency, we are taking the entire process down.
[rank0]:[E ProcessGroupNCCL.cpp:1414] [PG 0 Rank 0] Process group watchdog thread terminated with exception: [Rank 0] Watchdog caught collective operation timeout: WorkNCCL(SeqNum=52989, OpType=_ALLGATHER_BASE, NumelIn=181190315, NumelOut=543570945, Timeout(ms)=1800000) ran for 1800056 milliseconds before timing out.
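The crash itself is the NCCL watchdog hitting its default 30-minute limit (Timeout(ms)=1800000) on a ZeRO-3 allgather, i.e. the final step was stuck rather than just slow, which fits a P2P hang. To rule out a genuinely slow collective before blaming hardware, the timeout can be raised; below is a minimal sketch for a manual process-group init (with torchrun, rank and world size come from the environment), and the HF Trainer alternatively exposes a --ddp_timeout argument in seconds:

from datetime import timedelta

import torch.distributed as dist

# Raise the collective timeout from the default 30 minutes to 2 hours so
# slow-but-progressing allgathers are not killed by the watchdog.
dist.init_process_group(backend="nccl", timeout=timedelta(hours=2))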