gangxu822 commented 2 months ago

```
tee -a /home/xugang/InternVL/internvl_chat/shell/internlm2_20b_dynamic/entity_extract_exps//training_log.txt
W0620 10:27:31.496586 140248178374464 torch/distributed/run.py:757]
W0620 10:27:31.496586 140248178374464 torch/distributed/run.py:757]
W0620 10:27:31.496586 140248178374464 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0620 10:27:31.496586 140248178374464 torch/distributed/run.py:757]
[E socket.cpp:957] [c10d] The client socket has timed out after 900s while trying to connect to (10.71.108.181, 60994).
Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.3.1', 'console_scripts', 'torchrun')())
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/run.py", line 879, in main
    run(args)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/run.py", line 870, in run
    elastic_launch(
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/launcher/api.py", line 132, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
    result = agent.run()
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
    result = f(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
    result = self._invoke_run(role)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
    self._initialize_workers(self._worker_group)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
    result = f(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/agent/server/api.py", line 705, in _initialize_workers
    self._rendezvous(worker_group)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
    result = f(*args, **kwargs)
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/agent/server/api.py", line 548, in _rendezvous
    store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
  File "/usr/local/lib/python3.9/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous
    self._store = TCPStore(  # type: ignore[call-arg]
torch.distributed.DistNetworkError: The client socket has timed out after 900s while trying to connect to (10.71.108.181, 60994).
```

gangxu822 commented 2 months ago

脚本：

OUTPUT_DIR='/opt/cv/InternVL/internvl_chat/shell/internlm2_20b_dynamic/entity_extract_exps/'

rm -r /opt/cv/grounding_exps/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune_0618/*

if [ ! -d "$OUTPUT_DIR" ]; then
  mkdir -p "$OUTPUT_DIR"
fi

bash shell/internlm2_20b_dynamic/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh

```
torchrun $DISTRIBUTED_ARGS /opt/cv/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py \
  --model_name_or_path "/mnt/data0/models--OpenGVLab--InternVL-Chat-V1-5/snapshots/InternVL-Chat-V1-5/" \
  --conv_style "internlm2-chat" \
  --output_dir ${OUTPUT_DIR} \
  --meta_path "/mnt/data0/event_entity_extraction_internvl1-5-demo.jsonl" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --max_dynamic_patch 6 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.4 \
  --pad2square False \
  --freeze_llm False \
  --freeze_mlp False \
  --freeze_backbone True \
  --vision_select_layer -1 \
  --use_data_resampling False \
  --dataloader_num_workers 4 \
  --bf16 True \
  --num_train_epochs 1 \
  --per_device_train_batch_size 1 \
  --gradient_accumulation_steps 1 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 2000 \
  --save_total_limit 20 \
  --learning_rate 2e-5 \
  --weight_decay 0.05 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 4096 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length True \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "/opt/cv/InternVL/internvl_chat/zero_stage3_config.json" \
  --report_to "tensorboard" \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"
```

1SingleFeng commented 1 month ago

请问你解决了该问题吗，能提供一下相关脚本吗

gangxu822 commented 1 month ago

> 请问你解决了该问题吗，能提供一下相关脚本吗

NODE_RANK设为0，好了

OpenGVLab / InternVL

finetune时候运行torchrun报这个错 #287

rm -r /opt/cv/grounding_exps/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune_0618/*

bash shell/internlm2_20b_dynamic/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh