vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Bug]: run failed on 8*H20 #7873

Open hitzhu opened 2 weeks ago

hitzhu commented 2 weeks ago

Your current environment

The output of `python collect_env.py`:

```text
Your output of `python collect_env.py` here
```

🐛 Describe the bug

```python
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=args.tp,
    max_model_len=args.max_model_len,
    enforce_eager=True,
    disable_custom_all_reduce=True,
)
```

Detail:

```text
Traceback (most recent call last):
  File "/checkpoint/binary/train_package/infer.py", line 127, in <module>
    llm = LLM(model=model_path,
  File "/root/.local/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 158, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/root/.local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 445, in from_engine_args
    engine = cls(
  File "/root/.local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 249, in __init__
    self.model_executor = executor_class(
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py", line 25, in __init__
    super().__init__(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 47, in __init__
    self._init_executor()
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 137, in _init_executor
    self._run_workers("init_device")
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 192, in _run_workers
    driver_worker_output = driver_worker_method(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/vllm/worker/worker.py", line 132, in init_device
    init_worker_distributed_environment(self.parallel_config, self.rank,
  File "/root/.local/lib/python3.10/site-packages/vllm/worker/worker.py", line 345, in init_worker_distributed_environment
    init_distributed_environment(parallel_config.world_size, rank,
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 872, in init_distributed_environment
    _WORLD = init_world_group(ranks, local_rank, backend)
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 754, in init_world_group
    return GroupCoordinator(
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 126, in __init__
    cpu_group = torch.distributed.new_group(ranks, backend="gloo")
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 93, in wrapper
    func_return = func(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4125, in new_group
    return _new_group_with_tag(
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4205, in _new_group_with_tag
    pg, pg_store = _new_process_group_helper(
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1569, in _new_process_group_helper
    backend_class = ProcessGroupGloo(backend_prefix_store, group_rank, group_size, timeout=timeout)
RuntimeError: [enforce fail at ../third_party/gloo/gloo/transport/tcp/device.cc:83] ifa != nullptr. Unable to find address for: eth0
```
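The failure happens while vLLM builds the Gloo CPU process group for tensor parallelism: Gloo looks for an interface named `eth0`, finds no address on it, and the check at `device.cc:83` fires. A minimal sketch for confirming which interfaces and addresses the node actually exposes (`socket.if_nameindex()` is standard library on Linux; `psutil` is an assumption here, it is not a vLLM dependency):

```python
import socket

# Interface names known to the kernel (Python >= 3.3, Linux).
for index, name in socket.if_nameindex():
    print(index, name)

# Addresses per interface need an extra library; psutil is one option.
try:
    import psutil

    for name, addrs in psutil.net_if_addrs().items():
        ipv4 = [a.address for a in addrs if a.family == socket.AF_INET]
        print(name, ipv4)
except ImportError:
    pass  # psutil not installed; `ip -brief addr` from a shell shows the same info
```

If `eth0` does not appear in that list, or appears without an IPv4 address, Gloo cannot bind to it and raises exactly this error.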

Before submitting a new issue...

youkaichao commented 2 weeks ago

Please provide detailed environment information.

LiaoYuanF commented 2 weeks ago

Check whether your eth0 NIC is up and working.
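If the NIC is healthy but simply is not named `eth0` (common on hosts where the interface is `ens...`, `bond0`, or similar), Gloo and NCCL can be pointed at the real interface before the `LLM` is constructed. A minimal sketch, not an official fix: `GLOO_SOCKET_IFNAME` and `NCCL_SOCKET_IFNAME` are standard PyTorch/NCCL environment variables, and `bond0` / the model path below are placeholders for your actual values:

```python
import os

# Tell Gloo (CPU groups) and NCCL (GPU collectives) which interface to use.
# Replace "bond0" with the interface name found on your machine.
os.environ["GLOO_SOCKET_IFNAME"] = "bond0"
os.environ["NCCL_SOCKET_IFNAME"] = "bond0"

from vllm import LLM

llm = LLM(
    model="/path/to/your/model",   # placeholder for model_path from the report
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=8,        # 8 GPUs, matching the 8*H20 setup in the title
    enforce_eager=True,
    disable_custom_all_reduce=True,
)
```

Setting the variables in the launching process is usually enough, since vLLM's multiprocessing workers inherit the parent environment; they can also be exported in the shell before running the script.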