vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Bug][Failing Test] - Quantization test - quantization/test_cpu_offload.py #18425

Closed · markmc closed this 1 month ago

markmc commented 1 month ago

Your current environment

Failing on main as of commit 9609327fa4
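
To reproduce locally, a minimal sketch (assuming a CUDA machine with vLLM's test dependencies installed and the repo checked out at that commit; the path reflects the standard `tests/` layout and is not taken from the CI config):

```python
# Sketch: run only the failing quantization tests from the vLLM repo root,
# after `git checkout 9609327fa4`.
import sys

import pytest

# pytest.main returns the exit code, which we propagate to the shell.
sys.exit(pytest.main(["-v", "tests/quantization/test_cpu_offload.py"]))
```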

🐛 Describe the bug

Failing tests:

FAILED quantization/test_cpu_offload.py::test_cpu_offload_gptq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_awq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_compressed_tensors - AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same.
ref_args=[] ref_envs=None
compare_args=['--cpu-offload-gb', '1'] compare_envs=None
ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
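
For context on what this comparison does: compare_two_settings launches the same model twice, once with the baseline arguments and once with the extra arguments (here --cpu-offload-gb 1), and asserts that the API responses match. A rough, hypothetical sketch of that greedy-completion comparison against two already-running servers (the ports are assumptions for illustration; the test harness manages its own servers):

```python
# Minimal sketch (not the CI harness): compare greedy completions from two
# locally started vLLM OpenAI-compatible servers, e.g.
#   vllm serve nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t --port 8000
#   vllm serve nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t --port 8001 --cpu-offload-gb 1
# The ports are assumptions for illustration only.
from openai import OpenAI

MODEL = "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t"
PROMPT = "Hello, my name is"  # same prompt the test harness uses


def greedy_text(base_url: str) -> str:
    """Return the text of a short greedy completion from one server."""
    client = OpenAI(base_url=base_url, api_key="EMPTY")
    completion = client.completions.create(model=MODEL,
                                           prompt=PROMPT,
                                           max_tokens=5,
                                           temperature=0.0)
    return completion.choices[0].text


ref = greedy_text("http://localhost:8000/v1")        # baseline
offload = greedy_text("http://localhost:8001/v1")    # --cpu-offload-gb 1
assert ref == offload, f"outputs differ: {ref!r} vs {offload!r}"
```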
Logs

```
quantization/test_torchao.py::test_opt_125m_int4wo_model_loading_with_params[cuda:0] SKIPPED
quantization/test_torchao.py::test_opt_125m_int4wo_model_per_module_quant SKIPPED
=================================== FAILURES ===================================
_ test_load_8bit_bnb_model[meta-llama/Llama-Guard-3-8B-INT8-read pre-quantized llama 8-bit model] _

args = ()
kwargs = {'description': 'read pre-quantized llama 8-bit model', 'example_prompts': ['vLLM is a high-throughput and memory-effi...odels.\n', ...], 'hf_runner': , 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', ...}
Skipped = , pid = 1736, pgid = 19, _pid = 1736, _exitcode = 256
old_signal_handler = 

    @functools.wraps(f)
    def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> None:
        # Make the process the leader of its own process group
        # to avoid sending SIGTERM to the parent process
        os.setpgrp()
        from _pytest.outcomes import Skipped
        pid = os.fork()
        print(f"Fork a new process to run a test {pid}")
        if pid == 0:
            try:
                f(*args, **kwargs)
            except Skipped as e:
                # convert Skipped to exit code 0
                print(str(e))
                os._exit(0)
            except Exception:
                import traceback
                traceback.print_exc()
                os._exit(1)
            else:
                os._exit(0)
        else:
            pgid = os.getpgid(pid)
            _pid, _exitcode = os.waitpid(pid, 0)
            # ignore SIGTERM signal itself
            old_signal_handler = signal.signal(signal.SIGTERM, signal.SIG_IGN)
            # kill all child processes
            os.killpg(pgid, signal.SIGTERM)
            # restore the signal handler
            signal.signal(signal.SIGTERM, old_signal_handler)
>           assert _exitcode == 0, (f"function {f} failed when called with"
                                    f" args {args} and kwargs {kwargs}")
E           AssertionError: function failed when called with args () and kwargs {'hf_runner': , 'vllm_runner': , 'example_prompts': ['vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', 'Describe the basic components of a neural network and how it can be trained.\n', 'Write a short story about a robot that dreams for the first time.\n', 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n"], 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', 'description': 'read pre-quantized llama 8-bit model'}

utils.py:747: AssertionError
____________________________ test_cpu_offload_gptq _____________________________

monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc5b4cb2600>

    @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                        reason="gptq_marlin is not supported on this GPU type.")
    def test_cpu_offload_gptq(monkeypatch):
        # This quant method is sensitive to dummy weights, so we force real weights
        monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
        # Test GPTQ Marlin
>       compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
                             ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)

quantization/test_cpu_offload.py:33:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
    compare_all_settings(
utils.py:529: in compare_all_settings
    with RemoteOpenAIServer(model,
utils.py:133: in __init__
    self._wait_for_server(url=self.url_for("health"),
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = 

    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception:
                # this exception can only be raised by requests.get,
                # which means the server is not ready yet.
                # the stack trace is not useful, so we suppress it
                # by using `raise from None`.
                result = self.proc.poll()
                if result is not None and result != 0:
>                   raise RuntimeError("Server exited unexpectedly.") from None
E                   RuntimeError: Server exited unexpectedly.

utils.py:161: RuntimeError
_____________________________ test_cpu_offload_awq _____________________________

monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc59e8de900>

    @pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
                        reason="awq_marlin is not supported on this GPU type.")
    def test_cpu_offload_awq(monkeypatch):
        # This quant method is sensitive to dummy weights, so we force real weights
        monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
        # Test AWQ Marlin
>       compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
                             ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)

quantization/test_cpu_offload.py:49:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
    compare_all_settings(
utils.py:529: in compare_all_settings
    with RemoteOpenAIServer(model,
utils.py:133: in __init__
    self._wait_for_server(url=self.url_for("health"),
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = 

    def _wait_for_server(self, *, url: str, timeout: float):
        # run health check
        start = time.time()
        while True:
            try:
                if requests.get(url).status_code == 200:
                    break
            except Exception:
                # this exception can only be raised by requests.get,
                # which means the server is not ready yet.
                # the stack trace is not useful, so we suppress it
                # by using `raise from None`.
                result = self.proc.poll()
                if result is not None and result != 0:
>                   raise RuntimeError("Server exited unexpectedly.") from None
E                   RuntimeError: Server exited unexpectedly.

utils.py:161: RuntimeError
_____________________ test_cpu_offload_compressed_tensors ______________________

monkeypatch = <_pytest.monkeypatch.MonkeyPatch object at 0x7fc5b4b958e0>

    @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
                        reason="gptq_marlin is not supported on this GPU type.")
    def test_cpu_offload_compressed_tensors(monkeypatch):
        # This quant method is sensitive to dummy weights, so we force real weights
        monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
        # Test wNa16
        compare_two_settings("nm-testing/tinyllama-oneshot-w4a16-channel-v2", [],
                             ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)
        # Test w4a16_marlin24
>       compare_two_settings("nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
                             [], ["--cpu-offload-gb", "1"],
                             max_wait_seconds=480)

quantization/test_cpu_offload.py:69:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
utils.py:465: in compare_two_settings
    compare_all_settings(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

model = 'nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t'
all_args = [[], ['--cpu-offload-gb', '1']], all_envs = [None, None]

    def compare_all_settings(model: str,
                             all_args: list[list[str]],
                             all_envs: list[Optional[dict[str, str]]],
                             *,
                             method: str = "generate",
                             max_wait_seconds: Optional[float] = None) -> None:
        """
        Launch API server with several different sets of arguments/environments
        and compare the results of the API calls with the first set of arguments.
        Args:
            model: The model to test.
            all_args: A list of argument lists to pass to the API server.
            all_envs: A list of environment dictionaries to pass to the API server.
        """
        trust_remote_code = False
        for args in all_args:
            if "--trust-remote-code" in args:
                trust_remote_code = True
                break
        tokenizer_mode = "auto"
        for args in all_args:
            if "--tokenizer-mode" in args:
                tokenizer_mode = args[args.index("--tokenizer-mode") + 1]
                break
        tokenizer = get_tokenizer(
            model,
            trust_remote_code=trust_remote_code,
            tokenizer_mode=tokenizer_mode,
        )
        can_force_load_format = True
        for args in all_args:
            if "--load-format" in args:
                can_force_load_format = False
                break
        prompt = "Hello, my name is"
        token_ids = tokenizer(prompt).input_ids
        ref_results: list = []
        for i, (args, env) in enumerate(zip(all_args, all_envs)):
            if can_force_load_format:
                # we are comparing the results and
                # usually we don't need real weights.
                # we force to use dummy weights by default,
                # and it should work for most of the cases.
                # if not, we can use VLLM_TEST_FORCE_LOAD_FORMAT
                # environment variable to force the load format,
                # e.g. in quantization tests.
                args = args + ["--load-format", envs.VLLM_TEST_FORCE_LOAD_FORMAT]
            compare_results: list = []
            results = ref_results if i == 0 else compare_results
            with RemoteOpenAIServer(model,
                                    args,
                                    env_dict=env,
                                    max_wait_seconds=max_wait_seconds) as server:
                client = server.get_client()
                # test models list
                models = client.models.list()
                models = models.data
                served_model = models[0]
                results.append({
                    "test": "models_list",
                    "id": served_model.id,
                    "root": served_model.root,
                })
                if method == "generate":
                    results += _test_completion(client, model, prompt, token_ids)
                elif method == "generate_close":
                    results += _test_completion_close(client, model, prompt)
                elif method == "generate_chat":
                    results += _test_chat(client, model, prompt)
                elif method == "generate_with_image":
                    results += _test_image_text(
                        client, model,
                        "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
                    )
                elif method == "encode":
                    results += _test_embeddings(client, model, prompt)
                else:
                    raise ValueError(f"Unknown method: {method}")
                if i > 0:
                    # if any setting fails, raise an error early
                    ref_args = all_args[0]
                    ref_envs = all_envs[0]
                    compare_args = all_args[i]
                    compare_envs = all_envs[i]
                    for ref_result, compare_result in zip(ref_results,
                                                          compare_results):
                        ref_result = copy.deepcopy(ref_result)
                        compare_result = copy.deepcopy(compare_result)
                        if "embedding" in ref_result and method == "encode":
                            sim = F.cosine_similarity(
                                torch.tensor(ref_result["embedding"]),
                                torch.tensor(compare_result["embedding"]),
                                dim=0,
                            )
                            assert sim >= 0.999, (
                                f"Embedding for {model=} are not the same.\n"
                                f"cosine_similarity={sim}\n")
                            del ref_result["embedding"]
                            del compare_result["embedding"]
>                       assert ref_result == compare_result, (
                            f"Results for {model=} are not the same.\n"
                            f"{ref_args=} {ref_envs=}\n"
                            f"{compare_args=} {compare_envs=}\n"
                            f"{ref_result=}\n"
                            f"{compare_result=}\n")
E                       AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same.
E                       ref_args=[] ref_envs=None
E                       compare_args=['--cpu-offload-gb', '1'] compare_envs=None
E                       ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
E                       compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}

utils.py:582: AssertionError
...
=========================== short test summary info ============================
FAILED quantization/test_bitsandbytes.py::test_load_8bit_bnb_model[meta-llama/Llama-Guard-3-8B-INT8-read pre-quantized llama 8-bit model] - AssertionError: function failed when called with args () and kwargs {'hf_runner': , 'vllm_runner': , 'example_prompts': ['vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.\n', 'Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.\n', 'Compare and contrast artificial intelligence with human intelligence in terms of processing information.\n', 'Describe the basic components of a neural network and how it can be trained.\n', 'Write a short story about a robot that dreams for the first time.\n', 'Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.\n', 'Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.\n', "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'\n"], 'model_name': 'meta-llama/Llama-Guard-3-8B-INT8', 'description': 'read pre-quantized llama 8-bit model'}
FAILED quantization/test_cpu_offload.py::test_cpu_offload_gptq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_awq - RuntimeError: Server exited unexpectedly.
FAILED quantization/test_cpu_offload.py::test_cpu_offload_compressed_tensors - AssertionError: Results for model='nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t' are not the same. ref_args=[] ref_envs=None compare_args=['--cpu-offload-gb', '1'] compare_envs=None ref_result={'test': 'single_completion', 'text': ' ... ... . Today I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)} compare_result={'test': 'single_completion', 'text': ' ... ... .\n I', 'finish_reason': 'length', 'usage': CompletionUsage(completion_tokens=5, prompt_tokens=6, total_tokens=11, completion_tokens_details=None, prompt_tokens_details=None)}
====== 4 failed, 78 passed, 35 skipped, 49 warnings in 4561.59s (1:16:01) ======
^^^ +++
🚨 Error: The command exited with status 1
^^^ +++
user command error: The plugin docker command hook exited with status 1
~~~ Running global pre-exit hook
$ /etc/buildkite-agent/hooks/pre-exit
~~~ Running plugin docker pre-exit hook
$ /var/lib/buildkite-agent/plugins/bk-gpu-1-queue-ci-i-036cff6c74f0af4ae-1/github-com-buildkite-plugins-docker-buildkite-plugin-v5-2-0/hooks/pre-exit
```
markmc commented 1 month ago

From @DarkLight1337:

The quantization test is failing on main, starting somewhere between #17826 and #18158.

#17945 is the first PR where it failed. cc @Chen Zhang and @WoosukKwon

markmc commented 1 month ago

From @heheda12345:

https://github.com/vllm-project/vllm/pull/18298 is a minimal reproduction of the CI failure, built on top of the commit before #17945. It seems it also triggers a previously seen quantization bug, like #18147. @mgoin, do you have any idea this time?

russellb commented 1 month ago

I thought I had an idea on this one. I didn't think a request with temperature=1.0 would be deterministic, even with a seed set, so I tried removing that part of the test, but it didn't fix it; I'm still seeing some differences in output. (An illustrative sketch of the removed seeded-sampling request follows the diff below.)

diff --git a/tests/utils.py b/tests/utils.py
index bf38d7843..09285fdb3 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -196,6 +196,7 @@ def _test_completion(
     model: str,
     prompt: str,
     token_ids: list[int],
+    deterministic: bool = False,
 ):
     results = []

@@ -227,36 +228,37 @@ def _test_completion(
         "usage": completion.usage,
     })

-    # test seeded random sampling
-    completion = client.completions.create(model=model,
-                                           prompt=prompt,
-                                           max_tokens=5,
-                                           seed=33,
-                                           temperature=1.0)
-
-    results.append({
-        "test": "seeded_sampling",
-        "text": completion.choices[0].text,
-        "finish_reason": completion.choices[0].finish_reason,
-        "usage": completion.usage,
-    })
-
-    # test seeded random sampling with multiple prompts
-    completion = client.completions.create(model=model,
-                                           prompt=[prompt, prompt],
-                                           max_tokens=5,
-                                           seed=33,
-                                           temperature=1.0)
+    if not deterministic:
+        # test seeded random sampling
+        completion = client.completions.create(model=model,
+                                               prompt=prompt,
+                                               max_tokens=5,
+                                               seed=33,
+                                               temperature=1.0)
+
+        results.append({
+            "test": "seeded_sampling",
+            "text": completion.choices[0].text,
+            "finish_reason": completion.choices[0].finish_reason,
+            "usage": completion.usage,
+        })

-    results.append({
-        "test":
-        "seeded_sampling",
-        "text": [choice.text for choice in completion.choices],
-        "finish_reason":
-        [choice.finish_reason for choice in completion.choices],
-        "usage":
-        completion.usage,
-    })
+        # test seeded random sampling with multiple prompts
+        completion = client.completions.create(model=model,
+                                               prompt=[prompt, prompt],
+                                               max_tokens=5,
+                                               seed=33,
+                                               temperature=1.0)
+
+        results.append({
+            "test":
+            "seeded_sampling",
+            "text": [choice.text for choice in completion.choices],
+            "finish_reason":
+            [choice.finish_reason for choice in completion.choices],
+            "usage":
+            completion.usage,
+        })

     # test simple list
     batch = client.completions.create(
@@ -543,7 +545,11 @@ def compare_all_settings(model: str,
             })

             if method == "generate":
-                results += _test_completion(client, model, prompt, token_ids)
+                results += _test_completion(client,
+                                            model,
+                                            prompt,
+                                            token_ids,
+                                            deterministic=True)
             elif method == "generate_close":
                 results += _test_completion_close(client, model, prompt)
             elif method == "generate_chat":
DarkLight1337 commented 1 month ago

Resolved by #18459