Your current environment

The output of `python collect_env.py`

```text
Your output of `python collect_env.py` here
```

🐛 Describe the bug
```python
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=args.tp,
    max_model_len=args.max_model_len,
    enforce_eager=True,
    disable_custom_all_reduce=True,
)
```
Detail:

```text
Traceback (most recent call last):
  File "/checkpoint/binary/train_package/infer.py", line 127, in <module>
    llm = LLM(model=model_path,
  File "/root/.local/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 158, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/root/.local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 445, in from_engine_args
    engine = cls(
  File "/root/.local/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 249, in __init__
    self.model_executor = executor_class(
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/distributed_gpu_executor.py", line 25, in __init__
    super().__init__(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 47, in __init__
    self._init_executor()
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 137, in _init_executor
    self._run_workers("init_device")
  File "/root/.local/lib/python3.10/site-packages/vllm/executor/multiproc_gpu_executor.py", line 192, in _run_workers
    driver_worker_output = driver_worker_method(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/vllm/worker/worker.py", line 132, in init_device
    init_worker_distributed_environment(self.parallel_config, self.rank,
  File "/root/.local/lib/python3.10/site-packages/vllm/worker/worker.py", line 345, in init_worker_distributed_environment
    init_distributed_environment(parallel_config.world_size, rank,
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 872, in init_distributed_environment
    _WORLD = init_world_group(ranks, local_rank, backend)
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 754, in init_world_group
    return GroupCoordinator(
  File "/root/.local/lib/python3.10/site-packages/vllm/distributed/parallel_state.py", line 126, in __init__
    cpu_group = torch.distributed.new_group(ranks, backend="gloo")
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 93, in wrapper
    func_return = func(*args, **kwargs)
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4125, in new_group
    return _new_group_with_tag(
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 4205, in _new_group_with_tag
    pg, pg_store = _new_process_group_helper(
  File "/root/.local/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1569, in _new_process_group_helper
    backend_class = ProcessGroupGloo(backend_prefix_store, group_rank, group_size, timeout=timeout)
RuntimeError: [enforce fail at ../third_party/gloo/gloo/transport/tcp/device.cc:83] ifa != nullptr. Unable to find address for: eth0
```
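The failure happens while vLLM builds its gloo-backed CPU process group, and gloo cannot find a network interface named `eth0` on this node. A minimal sketch of the usual workaround (untested here) is to pin gloo to an interface that actually exists via the `GLOO_SOCKET_IFNAME` environment variable; the interface name `ens3` below is only a placeholder (the real name can be read from `ip addr`), and `model_path`/`args` are the same variables as in the snippet above:

```python
import os

# Assumption: the node's real NIC is not eth0; replace "ens3" with the
# interface name reported by `ip addr` on the failing machine.
# GLOO_SOCKET_IFNAME tells gloo (used for vLLM's CPU process group)
# which interface to bind for its TCP transport.
os.environ["GLOO_SOCKET_IFNAME"] = "ens3"

from vllm import LLM

# Same call as above; the env var is set before engine construction so the
# spawned worker processes inherit it.
llm = LLM(
    model=model_path,
    trust_remote_code=True,
    gpu_memory_utilization=0.9,
    tensor_parallel_size=args.tp,
    max_model_len=args.max_model_len,
    enforce_eager=True,
    disable_custom_all_reduce=True,
)
```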
Before submitting a new issue...

[X] Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the documentation page, which can answer lots of frequently asked questions.