imraviagrawal opened this issue 11 months ago
Is the root cause of this issue known yet? I've seen this come up quite a few times on my end and am not sure how to resolve it.
Yeah, I still face this issue and am not sure how to resolve it. I tried building from source, but that does not help either.
@amartino1 @imraviagrawal Please paste the full log so that we can help.
Sure @youkaichao
Traceback (most recent call last):
  File "/usr/local/bin/lm_eval", line 33, in <module>
    sys.exit(load_entry_point('lm-eval', 'console_scripts', 'lm_eval')())
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/__main__.py", line 207, in cli_evaluate
    results = evaluator.simple_evaluate(
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/utils.py", line 402, in _wrapper
    return fn(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/evaluator.py", line 102, in simple_evaluate
    lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/api/model.py", line 136, in create_from_arg_string
    return cls(**args, **args2)
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/models/vllm_causallms.py", line 53, in __init__
    self.model = LLM(
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 112, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 196, in from_engine_args
    engine = cls(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 110, in __init__
    self.model_executor = executor_class(model_config, cache_config,
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/ray_gpu_executor.py", line 65, in __init__
    self._init_cache()
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/ray_gpu_executor.py", line 220, in _init_cache
    num_blocks = self._run_workers(
  File "/usr/local/lib/python3.10/dist-packages/vllm/executor/ray_gpu_executor.py", line 324, in _run_workers
    driver_worker_output = getattr(self.driver_worker,
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 131, in profile_num_available_blocks
    self.model_runner.profile_run()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 742, in profile_run
    self.execute_model(seqs, kv_caches)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 644, in execute_model
    ) = self.prepare_input_tensors(seq_group_metadata_list)
  File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 611, in prepare_input_tensors
    broadcast_tensor_dict(metadata_dict, src=0)
  File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 171, in broadcast_tensor_dict
    torch.distributed.broadcast_object_list([metadata_list],
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2603, in broadcast_object_list
    broadcast(object_sizes_tensor, src=src, group=group)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1906, in broadcast
    work = default_pg.broadcast([tensor], opts)
RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44] Error executing method profile_num_available_blocks. This might cause deadlock in distributed execution.
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44] Traceback (most recent call last):
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 37, in execute_method
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return executor(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return func(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 131, in profile_num_available_blocks
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     self.model_runner.profile_run()
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return func(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 742, in profile_run
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     self.execute_model(seqs, kv_caches)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return func(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 644, in execute_model
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     ) = self.prepare_input_tensors(seq_group_metadata_list)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 613, in prepare_input_tensors
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     metadata_dict = broadcast_tensor_dict(src=0)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 180, in broadcast_tensor_dict
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     torch.distributed.broadcast_object_list(recv_metadata_list,
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return func(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2603, in broadcast_object_list
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     broadcast(object_sizes_tensor, src=src, group=group)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     return func(*args, **kwargs)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]   File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1906, in broadcast
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44]     work = default_pg.broadcast([tensor], opts)
(RayWorkerVllm pid=12661) ERROR 04-10 18:29:52 ray_utils.py:44] RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
(RayWorkerVllm pids 12773, 13108, 13218, 13328, 13438, and 13548 print the identical traceback.)
I think this issue should go to the lm-eval repo; it looks like they set up distributed inference incorrectly.
I saw this issue in my CI job https://buildkite.com/vllm/ci/builds/4249#018ebafc-0ad9-444e-af70-cafa90556dc4 and could fix it by making sure the collective communication call happens in the right group: f582b57 (#3904).
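To illustrate the idea (this is only a sketch of the pattern, not the actual change in f582b57): object broadcasts should be issued on the process group that was created for them, instead of silently falling back to the default group. Something like:

import torch.distributed as dist

def broadcast_metadata(metadata, group, src=0):
    # Pass the intended process group explicitly rather than relying on the
    # default (global) group, so the collective runs where it is expected to.
    obj = [metadata] if dist.get_rank() == src else [None]
    dist.broadcast_object_list(obj, src=src, group=group)
    return obj[0]

Here broadcast_metadata is a made-up helper name; the point is only the explicit group= argument.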
FYI, I've seen this error as well without using lm-eval (i.e., just using vLLM).
It is admittedly really hard to reproduce. Colleagues have built identical Python environments on the same VM and found it runs perfectly, but when I do it, I see this inplace-update error.
The only "fix" I've found consistently works is to build vllm inside a fresh docker container with all of the relevant requirements etc and run it there, but ofc this solution may not be practical depending on your use case.
@youkaichao, I get the same issue when running vLLM directly; the full log is pasted below. It's not an issue when running with TP==1.
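For context, the failing setup is just the plain vLLM Python API with tensor parallelism, roughly like the sketch below (model name and TP size are placeholders, not taken from the log); the actual log follows right after.

from vllm import LLM

# TP > 1 is what launches the RayWorkerVllm processes where the error surfaces.
llm = LLM(model="some/model", tensor_parallel_size=8)
outputs = llm.generate(["Hello"])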
WARNING 04-10 19:57:00 tokenizer.py:104] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
INFO 04-10 19:57:17 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 04-10 19:57:17 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=12997) INFO 04-10 19:57:19 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=12997) INFO 04-10 19:57:19 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=13107) INFO 04-10 19:57:19 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=13107) INFO 04-10 19:57:19 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=12443) INFO 04-10 19:57:20 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=12443) INFO 04-10 19:57:20 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=12777) INFO 04-10 19:57:20 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=12777) INFO 04-10 19:57:20 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=12887) INFO 04-10 19:57:20 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=12887) INFO 04-10 19:57:20 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=13218) INFO 04-10 19:57:20 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=13218) INFO 04-10 19:57:20 selector.py:25] Using XFormers backend.
(RayWorkerVllm pid=12329) INFO 04-10 19:57:20 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
(RayWorkerVllm pid=12329) INFO 04-10 19:57:20 selector.py:25] Using XFormers backend.
INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=12329) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=12443) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=12777) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=12887) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=12997) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=13107) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
(RayWorkerVllm pid=13218) INFO 04-10 19:57:21 pynccl_utils.py:45] vLLM is using nccl==2.18.1
INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=13107) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=13218) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12329) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12443) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12777) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12887) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12997) INFO 04-10 20:00:50 model_runner.py:104] Loading model weights took 10.8814 GB
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] Error executing method profile_num_available_blocks. This might cause deadlock in distributed execution.
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] Traceback (most recent call last):
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 37, in execute_method
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return executor(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 131, in profile_num_available_blocks
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] self.model_runner.profile_run()
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 742, in profile_run
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] self.execute_model(seqs, kv_caches)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 644, in execute_model
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] ) = self.prepare_input_tensors(seq_group_metadata_list)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 613, in prepare_input_tensors
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] metadata_dict = broadcast_tensor_dict(src=0)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 180, in broadcast_tensor_dict
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] torch.distributed.broadcast_object_list(recv_metadata_list,
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2603, in broadcast_object_list
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] broadcast(object_sizes_tensor, src=src, group=group)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1906, in broadcast
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] work = default_pg.broadcast([tensor], opts)
(RayWorkerVllm pid=12887) ERROR 04-10 20:00:50 ray_utils.py:44] RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] Error executing method profile_num_available_blocks. This might cause deadlock in distributed execution.
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] Traceback (most recent call last):
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 37, in execute_method
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return executor(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 131, in profile_num_available_blocks
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] self.model_runner.profile_run()
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 742, in profile_run
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] self.execute_model(seqs, kv_caches)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 644, in execute_model
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] ) = self.prepare_input_tensors(seq_group_metadata_list)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 613, in prepare_input_tensors
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] metadata_dict = broadcast_tensor_dict(src=0)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 180, in broadcast_tensor_dict
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] torch.distributed.broadcast_object_list(recv_metadata_list,
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2603, in broadcast_object_list
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] broadcast(object_sizes_tensor, src=src, group=group)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] return func(*args, **kwargs)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 1906, in broadcast
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] work = default_pg.broadcast([tensor], opts)
(RayWorkerVllm pid=13107) ERROR 04-10 20:00:50 ray_utils.py:44] RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
Traceback (most recent call last):
File "/data/users/ravi/experiments/summarization-research/FastChat/fastchat/eval/eval.py", line 435, in
@imraviagrawal can you check whether the previous run was cleaned up completely? E.g. use ps -aux to see if some processes are still running from last time.
@youkaichao, I submitted a new job for the last log; this is running on a cluster inside a Docker container.
I encountered a similar issue and resolved it by removing the following env setting:
TORCH_DISTRIBUTED_DEBUG=DETAIL
If you're unsure how to modify it, set it to an empty value before running your command:
export TORCH_DISTRIBUTED_DEBUG=''
run_your_command_here
or simply:
TORCH_DISTRIBUTED_DEBUG='' run_your_command_here
Hope this helps.
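If you launch vLLM from a Python script rather than a shell command, an equivalent sketch (standard library only; the model name and TP size are placeholders) is to drop the variable before the engine initializes:

import os

# Clear any inherited debug setting before torch.distributed is initialized.
os.environ.pop("TORCH_DISTRIBUTED_DEBUG", None)

from vllm import LLM
llm = LLM(model="some/model", tensor_parallel_size=4)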
I was getting the same issue on a multi-node, multi-GPU setup with a Ray cluster and vLLM. TORCH_DISTRIBUTED_DEBUG='' fixed the issue, not sure why.
unset TORCH_DISTRIBUTED_DEBUG could also be a solution.
Logs here:
llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Qwen/Qwen2-7B', speculative_config=None, tokenizer='Qwen/Qwen2-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Qwen2-7B, use_v2_block_manager=True, enable_prefix_caching=True)
...
engine = engine_cls.from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 471, in from_engine_args
engine = cls(
^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 381, in __init__
self.engine = self._init_engine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 552, in _init_engine
return engine_class(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 263, in __init__
self._initialize_kv_caches()
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 362, in _initialize_kv_caches
self.model_executor.determine_num_available_blocks())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/executor/distributed_gpu_executor.py", line 38, in determine_num_available_blocks
num_blocks = self._run_workers("determine_num_available_blocks", )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/executor/multiproc_gpu_executor.py", line 192, in _run_workers
driver_worker_output = driver_worker_method(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/worker.py", line 179, in determine_num_available_blocks
self.model_runner.profile_run()
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 940, in profile_run
self.execute_model(model_input, kv_caches, intermediate_tensors)
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 1363, in execute_model
hidden_or_intermediate_states = model_executable(
^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 360, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 268, in forward
hidden_states = self.embed_tokens(input_ids)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 352, in forward
output = tensor_model_parallel_all_reduce(output_parallel)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/distributed/communication_op.py", line 11, in tensor_model_parallel_all_reduce
return get_tp_group().all_reduce(input_)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/distributed/parallel_state.py", line 293, in all_reduce
torch.distributed.all_reduce(input_, group=self.device_group)
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 2288, in all_reduce
work = group.allreduce([tensor], opts)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
Unexpectedly, unset TORCH_DISTRIBUTED_DEBUG indeed works. What's the logic behind this...?
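For what it's worth, the RuntimeError itself is generic PyTorch behavior: a tensor created under torch.inference_mode() becomes an "inference tensor", and any in-place write to it outside inference mode raises exactly this message. A minimal standalone reproduction of just that mechanism (unrelated to vLLM) is:

import torch

with torch.inference_mode():
    t = torch.zeros(3)  # t is now an inference tensor

t.add_(1)  # RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.

Presumably the extra bookkeeping that TORCH_DISTRIBUTED_DEBUG=DETAIL enables performs such an in-place update on a tensor created during the inference-mode profiling run, but that part is only a guess.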
Logs here:
llm_engine.py:174] Initializing an LLM engine (v0.5.4) with config: model='Qwen/Qwen2-7B', speculative_config=None, tokenizer='Qwen/Qwen2-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=4, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=Qwen2-7B, use_v2_block_manager=True, enable_prefix_caching=True) ... engine = engine_cls.from_engine_args( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 471, in from_engine_args engine = cls( ^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 381, in __init__ self.engine = self._init_engine(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/async_llm_engine.py", line 552, in _init_engine return engine_class(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 263, in __init__ self._initialize_kv_caches() File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/engine/llm_engine.py", line 362, in _initialize_kv_caches self.model_executor.determine_num_available_blocks()) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/executor/distributed_gpu_executor.py", line 38, in determine_num_available_blocks num_blocks = self._run_workers("determine_num_available_blocks", ) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/executor/multiproc_gpu_executor.py", line 192, in _run_workers driver_worker_output = driver_worker_method(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/worker.py", line 179, in determine_num_available_blocks self.model_runner.profile_run() File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 940, in profile_run self.execute_model(model_input, kv_caches, intermediate_tensors) File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/worker/model_runner.py", line 1363, in execute_model hidden_or_intermediate_states = model_executable( ^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File 
"/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 360, in forward hidden_states = self.model(input_ids, positions, kv_caches, ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/models/qwen2.py", line 268, in forward hidden_states = self.embed_tokens(input_ids) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl return self._call_impl(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl return forward_call(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 352, in forward output = tensor_model_parallel_all_reduce(output_parallel) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/distributed/communication_op.py", line 11, in tensor_model_parallel_all_reduce return get_tp_group().all_reduce(input_) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/vllm/distributed/parallel_state.py", line 293, in all_reduce torch.distributed.all_reduce(input_, group=self.device_group) File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/distributed/c10d_logger.py", line 79, in wrapper return func(*args, **kwargs) ^^^^^^^^^^^^^^^^^^^^^ File "/root/micromamba/envs/py311/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 2288, in all_reduce work = group.allreduce([tensor], opts) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed.You can make a clone to get a normal tensor before doing inplace update.See https://github.com/pytorch/rfcs/pull/17 for more details.
Unexpectedly, unset TORCH_DISTRIBUTED_DEBUG indeed works. What's the logic behind this...?
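In case it helps anyone who cannot change the shell environment: a sketch of the same workaround done from Python, under the assumption (based only on the reports above) that the variable merely has to be absent in the process that constructs the engine. The model name and tensor_parallel_size below are placeholders, not taken from this thread.

import os

# Equivalent of `unset TORCH_DISTRIBUTED_DEBUG`, applied to this process only.
# Assumption: having the variable set (e.g. to DETAIL) is what trips the error,
# so clearing it before the engine initializes should match the shell fix above.
os.environ.pop("TORCH_DISTRIBUTED_DEBUG", None)

from vllm import LLM  # construct the engine only after clearing the variable

llm = LLM(model="Qwen/Qwen2-7B", tensor_parallel_size=4)  # placeholder args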
unset TORCH_DISTRIBUTED_DEBUG fixed my case too!
1x8 H100 DGX box; Torch version: 2.1.1; CUDA version: 12.1; vLLM: 0.2.3
Inference works just fine with tensor_parallel_size=1, but with tp > 1 I get the error below (a minimal sketch of this setup follows the trace):
WARNING 12-07 18:20:11 tokenizer.py:79] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
Traceback (most recent call last):
  File "/usr/local/bin/lm_eval", line 33, in <module>
    sys.exit(load_entry_point('lm-eval', 'console_scripts', 'lm_eval')())
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/__main__.py", line 207, in cli_evaluate
    results = evaluator.simple_evaluate(
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/utils.py", line 402, in _wrapper
    return fn(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/evaluator.py", line 102, in simple_evaluate
    lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/api/model.py", line 136, in create_from_arg_string
    return cls(**args, **args2)
  File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/models/vllm_causallms.py", line 53, in __init__
    self.model = LLM(
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/entrypoints/llm.py", line 93, in __init__
    self.llm_engine = LLMEngine.from_engine_args(engine_args)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/llm_engine.py", line 246, in from_engine_args
    engine = cls(*engine_configs,
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/llm_engine.py", line 112, in __init__
    self._init_cache()
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/llm_engine.py", line 208, in _init_cache
    num_blocks = self._run_workers(
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/llm_engine.py", line 750, in _run_workers
    self._run_workers_in_batch(workers, method, *args, **kwargs))
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/llm_engine.py", line 727, in _run_workers_in_batch
    all_outputs = ray.get(all_outputs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2563, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::RayWorkerVllm.execute_method() (pid=40620, ip=10.233.75.70, actor_id=922fe635751d218fc62f064a01000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f62bb8d0ca0>)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/engine/ray_utils.py", line 31, in execute_method
    return executor(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/worker/worker.py", line 88, in profile_num_available_blocks
    self.model_runner.profile_run()
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/worker/model_runner.py", line 321, in profile_run
    self.execute_model(seqs, kv_caches)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/worker/model_runner.py", line 279, in execute_model
    hidden_states = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/model_executor/models/llama.py", line 294, in forward
    hidden_states = self.model(input_ids, positions, kv_caches,
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/model_executor/models/llama.py", line 255, in forward
    hidden_states = self.embed_tokens(input_ids)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
    return forward_call(*args, **kwargs)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/model_executor/layers/vocab_parallel_embedding.py", line 101, in forward
    output = tensor_model_parallel_all_reduce(output_parallel)
  File "/data/users/ravi/experiments/summarization-research/vllm/vllm/model_executor/parallel_utils/communication_op.py", line 18, in tensor_model_parallel_all_reduce
    torch.distributed.all_reduce(input_,
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2050, in all_reduce
    work = group.allreduce([tensor], opts)
RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed. You can make a clone to get a normal tensor before doing inplace update. See https://github.com/pytorch/rfcs/pull/17 for more details.
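For anyone trying to reproduce this outside lm-eval, here is a minimal sketch of the tensor-parallel setup described above using vLLM's public LLM API. The model name is purely illustrative (not taken from this thread); the report is that tensor_parallel_size=1 works while tp > 1 fails during KV-cache profiling when TORCH_DISTRIBUTED_DEBUG is set.

from vllm import LLM, SamplingParams

# tensor_parallel_size=1 reportedly works; raising it above 1 (as below) is what
# hits the inference-tensor RuntimeError during the engine's profiling run.
llm = LLM(model="meta-llama/Llama-2-7b-hf", tensor_parallel_size=4)

out = llm.generate(["Hello, world"], SamplingParams(max_tokens=16))
print(out[0].outputs[0].text)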