canonical / bundle-kubeflow

Charmed Kubeflow
Apache License 2.0
104 stars 50 forks source link

`kserve-integration` UAT fails in the CI on AKS and EKS with `Notebook execution failed with KeyError: 'status'` #1100

Open NohaIhab opened 1 month ago

NohaIhab commented 1 month ago

Bug Description

Seen in the scheduled runs in AKS and EKS, the kserve-integration UAT failed. Logs are attached in the Relevant Log Output.

To Reproduce

Run either the `Create AKS cluster, deploy CKF and run bundle test` action or the `Create EKS cluster, deploy CKF and run bundle test` action in the CI for `latest/edge`.

Environment

AKS 1.29, EKS 1.29, Kubeflow bundle latest/edge, Juju 3.4/stable

Relevant Log Output

=================================== FAILURES ===================================
______________________ test_notebook[kserve-integration] _______________________

test_notebook = '/tests/.worktrees/b9848a5695a361eba1d9b0cfb2fddc99460b304e/tests/notebooks/kserve/kserve-integration.ipynb'

    @pytest.mark.ipynb
    @pytest.mark.parametrize(
        # notebook - ipynb file to execute
        "test_notebook",
        NOTEBOOKS.values(),
        ids=NOTEBOOKS.keys(),
    )
    def test_notebook(test_notebook):
        """Test Notebook Generic Wrapper."""
        os.chdir(os.path.dirname(test_notebook))

        with open(test_notebook) as nb:
            notebook = nbformat.read(nb, as_version=nbformat.NO_CONVERT)

        ep = ExecutePreprocessor(
            timeout=-1, kernel_name="python3", on_notebook_start=install_python_requirements
        )
        ep.skip_cells_with_tag = "pytest-skip"

        try:
            log.info(f"Running {os.path.basename(test_notebook)}...")
>           output_notebook, _ = ep.preprocess(notebook, {"metadata": {"path": "./"}})

/tests/.worktrees/b9848a5695a361eba1d9b0cfb2fddc99460b304e/tests/test_notebooks.py:45: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
/opt/conda/lib/python3.11/site-packages/nbconvert/preprocessors/execute.py:103: in preprocess
    self.preprocess_cell(cell, resources, index)
/opt/conda/lib/python3.11/site-packages/nbconvert/preprocessors/execute.py:124: in preprocess_cell
    cell = self.execute_cell(cell, index, store_history=True)
/opt/conda/lib/python3.11/site-packages/jupyter_core/utils/__init__.py:165: in wrapped
    return loop.run_until_complete(inner)
/opt/conda/lib/python3.11/asyncio/base_events.py:654: in run_until_complete
    return future.result()
/opt/conda/lib/python3.11/site-packages/nbclient/client.py:1062: in async_execute_cell
    await self._check_raise_for_error(cell, cell_index, exec_reply)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = <nbconvert.preprocessors.execute.ExecutePreprocessor object at 0x7fc7cc2e6650>
cell = ***'cell_type': 'code', 'execution_count': 8, 'id': '8522c4e9-07b7-4bff-9b49-3675ff19bacc', 'metadata': ***'execution': ***'...sp = client.get(ISVC_NAME)\nisvc_url = isvc_resp[\'status\'][\'address\'][\'url\']\nprint("Inference URL:", isvc_url)'***
cell_index = 16
exec_reply = ***'buffers': [], 'content': ***'ename': 'KeyError', 'engine_info': ***'engine_id': -1, 'engine_uuid': 'd5e7bdef-d6d9-4ef2-9...e, 'engine': 'd5e7bdef-d6d9-4ef2-9700-5284518715ee', 'started': '2024-10-01T01:07:25.404728Z', 'status': 'error'***, ...***

    async def _check_raise_for_error(
        self, cell: NotebookNode, cell_index: int, exec_reply: dict[str, t.Any] | None
    ) -> None:
        if exec_reply is None:
            return None

        exec_reply_content = exec_reply["content"]
        if exec_reply_content["status"] != "error":
            return None

        cell_allows_errors = (not self.force_raise_errors) and (
            self.allow_errors
            or exec_reply_content.get("ename") in self.allow_error_names
            or "raises-exception" in cell.metadata.get("tags", [])
        )
        await run_hook(
            self.on_cell_error, cell=cell, cell_index=cell_index, execute_reply=exec_reply
        )
        if not cell_allows_errors:
>           raise CellExecutionError.from_cell_and_msg(cell, exec_reply_content)
E           nbclient.exceptions.CellExecutionError: An error occurred while executing the following cell:
E           ------------------
E           isvc_resp = client.get(ISVC_NAME)
E           isvc_url = isvc_resp['status']['address']['url']
E           print("Inference URL:", isvc_url)
E           ------------------
E           
E           
E           ---------------------------------------------------------------------------
E           KeyError                                  Traceback (most recent call last)
E           Cell In[8], line 2
E                 1 isvc_resp = client.get(ISVC_NAME)
E           ----> 2 isvc_url = isvc_resp['status']['address']['url']
E                 3 print("Inference URL:", isvc_url)
E           
E           KeyError: 'status'

/opt/conda/lib/python3.11/site-packages/nbclient/client.py:918: CellExecutionError

During handling of the above exception, another exception occurred:

test_notebook = '/tests/.worktrees/b9848a5695a361eba1d9b0cfb2fddc99460b304e/tests/notebooks/kserve/kserve-integration.ipynb'

    @pytest.mark.ipynb
    @pytest.mark.parametrize(
        # notebook - ipynb file to execute
        "test_notebook",
        NOTEBOOKS.values(),
        ids=NOTEBOOKS.keys(),
    )
    def test_notebook(test_notebook):
        """Test Notebook Generic Wrapper."""
        os.chdir(os.path.dirname(test_notebook))

        with open(test_notebook) as nb:
            notebook = nbformat.read(nb, as_version=nbformat.NO_CONVERT)

        ep = ExecutePreprocessor(
            timeout=-1, kernel_name="python3", on_notebook_start=install_python_requirements
        )
        ep.skip_cells_with_tag = "pytest-skip"

        try:
            log.info(f"Running {os.path.basename(test_notebook)}...")
            output_notebook, _ = ep.preprocess(notebook, {"metadata": {"path": "./"}})
            # persist the notebook output to the original file for debugging purposes
            save_notebook(output_notebook, test_notebook)
        except CellExecutionError as e:
            # handle underlying error
>           pytest.fail(f"Notebook execution failed with {e.ename}: {e.evalue}")
E           Failed: Notebook execution failed with KeyError: 'status'

/tests/.worktrees/b9848a5695a361eba1d9b0cfb2fddc99460b304e/tests/test_notebooks.py:50: Failed

Additional Context

No response

syncronize-issues-to-jira[bot] commented 1 month ago

Thank you for reporting your feedback to us!

The internal ticket has been created: https://warthogs.atlassian.net/browse/KF-6348.

This message was autogenerated

NohaIhab commented 1 month ago

It's possible that the ISVC does not have a status yet when it is checked. We can add a wait there to make sure that it has a status before getting the URL.