OpenMOSS / CoLLiE

Collaborative Training of Large Language Models in an Efficient Way
https://openlmlab-collie.readthedocs.io
Apache License 2.0
405 stars 58 forks source link

RendezvousConnectionError,跑着跑着就有这个报错 #141

Closed 459737087 closed 9 months ago

459737087 commented 9 months ago

启动脚本

2.1 加载配置

# 2.1 Load the CoLLiE config from the pretrained model.
config = CollieConfig.from_pretrained(pretrained_model, trust_remote_code=True)
# NOTE(review): tp_size * dp_size * pp_size must equal the torchrun world
# size (nnodes * nproc_per_node). tp_size=8 needs 8 processes; launching
# with --nproc_per_node=4 leaves the rendezvous waiting for the missing
# ranks and eventually fails with RendezvousConnectionError / Socket Timeout.
config.tp_size = 8
config.dp_size = 1
config.pp_size = 1
config.train_epochs = 1
config.eval_per_n_steps = 0
config.eval_per_n_epochs = 1
config.train_micro_batch_size = 4
config.gradient_accumulation_steps = 1
config.eval_batch_size = 1
config.use_flash = True
config.ds_config = {
    "fp16": {
        "enabled": True
    },
    "zero_allow_untested_optimizer": True,
    "zero_force_ds_cpu_optimizer": True,
    # Enable ZeRO-3.
    # NOTE(review): per the maintainer, CoLLiE is only tested with either
    # tensor parallelism WITHOUT ZeRO-3, or dp + ZeRO-3 WITHOUT tp. Combining
    # tp_size>1 with stage 3 (as here) is untested — prefer dp_size=8,
    # tp_size=1 with this ds_config, or drop "zero_optimization" when using tp.
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": True,
        "contiguous_gradients": True,
        "sub_group_size": 1e8,
        "stage3_max_live_parameters": 1e8,
        "stage3_max_reuse_distance": 1e8,
        "stage3_gather_16bit_weights_on_model_save": True
    },
    "monitor_config": {
        "enabled": True,
        "tag": job_name,  # job_name must be defined before this point
        "csv_monitor": {
            "enabled": True,
            "output_path": "./ds_logs/"
        },
        "tensorboard": {
            "enabled": True,
            "output_path": "./ds_logs/",
            "job_name": job_name
        }
    }
}

# 3. Set up the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer, trust_remote_code=True, legacy=False)

tokenizer.pad_token_id = 0

# 4. Load the datasets.
train_dataset = load_dataset('json', data_files=data_files, split='train')
train_dataset = CollieDatasetForTraining(train_dataset, tokenizer)
# eval_dataset = load_dataset('json', data_files='/data/data/val_1.json', split='train')
# BUG FIX: the original wrapped `train_dataset` — already a
# CollieDatasetForTraining after the line above — in a second
# CollieDatasetForTraining. Reuse the wrapped dataset directly; evaluation
# still runs on the training split until a real eval file is loaded above.
eval_dataset = train_dataset

# 5. Load the pretrained model.
model = LlamaForCausalLM.from_pretrained(pretrained_model, config=config)
model.resize_token_embeddings(len(tokenizer))

# 6. Set up the optimizer (LOMO: low-memory full-parameter fine-tuning).
optimizer = Lomo(
    model,
    lr = 0.001,
    clip_grad_norm = 5.0,
    loss_scale_args = {
        'init_scale': 1024
    }
)

# 7. Attach training monitors (step time, tokens/GPU/s, memory, loss, eval).
monitors = [
    StepTimeMonitor(config),
    TGSMonitor(config),
    MemoryMonitor(config),
    LossMonitor(config),
    EvalMonitor(config)
]

# 8. Build the evaluators: perplexity and greedy-decode inspection.
evaluator_ppl = EvaluatorForPerplexity(
    model = model,
    config = config,
    dataset = eval_dataset,
    monitors = [
        EvalMonitor(config)
    ],
    metrics = {
        'ppl': PPLMetric()
    }
)
evaluator_decode = EvaluatorForGeneration(
    model = model,
    config = config,
    tokenizer = tokenizer,
    dataset = eval_dataset,
    monitors = [
        EvalMonitor(config)
    ],
    metrics = {
        'decode': DecodeMetric()
    },
    generation_config=GenerationConfig(
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=1024,
    )
)

# Periodic checkpointing: every 512 batches and at each epoch end,
# keeping at most 3 checkpoints (model + optimizer/trainer state).
callback = CheckpointCallback(
    folder=save_dir,
    model_only=False,
    every_n_batches=512,
    max=3,
    every_n_epochs=1
)

# 9. Instantiate the trainer and run.
trainer = Trainer(
    model = model,
    tokenizer=tokenizer,
    config = config,
    loss_fn = GPTLMLoss(-100),  # ignore_index=-100 masks padded label positions
    optimizer = optimizer,
    train_dataset = train_dataset,
    monitors = monitors,
    # evaluators = [evaluator_decode],
    callbacks = [callback]
)

trainer.train()
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 113, in _call_store
    return getattr(self._store, store_op)(*args, **kwargs)
RuntimeError: Socket Timeout

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/bin/torchrun", line 33, in <module>
    sys.exit(load_entry_point('torch==2.1.0.dev20230725+cu121', 'console_scripts', 'torchrun')())
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 797, in main
    run(args)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 788, in run
    elastic_launch(
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
    result = agent.run()
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
    result = f(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 736, in run
    result = self._invoke_run(role)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 909, in _invoke_run
    num_nodes_waiting = rdzv_handler.num_nodes_waiting()
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1083, in num_nodes_waiting
    self._state_holder.sync()
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 409, in sync
    get_response = self._backend.get_state()
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 73, in get_state
    base64_state: bytes = self._call_store("get", self._key)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py", line 115, in _call_store
    raise RendezvousConnectionError(
torch.distributed.elastic.rendezvous.api.RendezvousConnectionError: The connection to the C10d store has failed. See inner exception for details.
459737087 commented 9 months ago

Command CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --rdzv_backend=c10d --rdzv_endpoint=localhost:29402 --nnodes=1 --nproc_per_node=4 finetune_moss_for_training.py

KaiLv69 commented 9 months ago

你好,我注意到tp_size=8但是命令中只用了4张GPU,应该使用8张gpu。 BTW, 为了提高吞吐量,建议使用dp_size=8, tp_size=1和zero3

459737087 commented 9 months ago

你好,改成了4也是同样的错 @KaiLv69

KaiLv69 commented 9 months ago

你好,关掉zero3试试呢?代码在两种情况下测试过:tp+不带zero3的dp 或者 dp with zero3 不加 tp

459737087 commented 9 months ago

没成功,我也不知道有什么办法了。。。