Sinaptik-AI / pandas-ai

Chat with your database (SQL, CSV, pandas, polars, mongodb, noSQL, etc). PandasAI makes data analysis conversational using LLMs (GPT 3.5 / 4, Anthropic, VertexAI) and RAG.
https://pandas-ai.com
Other
12.68k stars 1.23k forks source link

bug on output_type_template.tmpl #1193

Closed colorwlof closed 2 weeks ago

colorwlof commented 3 months ago

System Info

2.0.4

🐛 Describe the bug

type (possible values "string", "number", "dataframe", "plot"). Example: { "type": "string",.........."value": f"The highest salary is {highest_salary}." }

When the Example is used, the LLM sometimes emits "value": instead of "answer". I think using a Format description would be better.

The other bug: if the AI creates a result that is not what I want, it will keep following the last result forever, until I delete the cache or disable the cache.

dosubot[bot] commented 3 months ago

To address the issues in the output_type_template.tmpl file in version 2.0.4:

  1. Incorrect type Field: Ensure that the type field is set to "answer" instead of "value".

  2. Prevent Reuse of Last Result: Add logic to handle unwanted results and prevent the system from reusing the last result indefinitely.

Here is a possible solution:

class CodeExecution(BaseLogicUnit):
    # ... other methods ...

    def execute(self, input: Any, **kwargs) -> Any:
        """Execute ``input`` as code, validate the result and retry on failure.

        Args:
            input: The code to run; it is handed to ``self.execute_code``.
            **kwargs: Expected to carry ``context`` (the pipeline context) and
                ``logger``; both are stored on the instance.

        Returns:
            A ``LogicUnitOutput`` built from the result dict and its
            serialized form.

        Raises:
            Exception: The caught error is re-raised when the error
                correction framework is disabled or ``max_retries`` has been
                reached.
        """
        # Cache the pieces of the pipeline context that this unit reads.
        self.context: PipelineContext = kwargs.get("context")
        self._dfs = self.context.dfs
        self._config = self.context.config
        self._additional_dependencies = self.context.get("additional_dependencies", [])
        self._current_code_executed = self.context.get("current_code_executed")
        self.logger: Logger = kwargs.get("logger")

        retry_count = 0
        code_to_run = input
        result = None
        # One initial attempt plus up to ``max_retries`` corrected attempts.
        while retry_count <= self.context.config.max_retries:
            try:
                # NOTE(review): ``code_context`` is not defined anywhere in
                # this excerpt; it must be created before this call or the
                # line raises NameError -- confirm against the full source.
                result = self.execute_code(code_to_run, code_context)
                # Only validate against an expected output type when one is
                # configured on the context (non-empty and truthy).
                if self.context.get("output_type") != "" and (
                    output_helper := self.context.get("output_type")
                ):
                    (validation_ok, validation_errors) = OutputValidator.validate(
                        output_helper, result
                    )

                    if not validation_ok:
                        raise InvalidLLMOutputType(validation_errors)

                # Independently check that the value agrees with the type the
                # result itself declares.
                if not OutputValidator.validate_result(result):
                    raise InvalidOutputValueMismatch(
                        f'Value type {type(result["value"])} must match with type {result["type"]}'
                    )

                # Ensure the type field is set to "answer"
                # This runs after both validations, so the checks above see
                # the type produced by the executed code, not "answer".
                result["type"] = "answer"

                break

            except Exception as e:
                # Log the full traceback and notify the optional failure hook
                # before deciding whether to retry.
                traceback_errors = traceback.format_exc()
                self.logger.log(f"Failed with error: {traceback_errors}", logging.ERROR)
                if self.on_failure:
                    self.on_failure(code_to_run, traceback_errors)

                # Give up when correction is disabled or retries are exhausted.
                if (
                    not self.context.config.use_error_correction_framework
                    or retry_count >= self.context.config.max_retries
                ):
                    raise e

                retry_count += 1

                self.logger.log(
                    f"Failed to execute code retrying with a correction framework "
                    f"[retry number: {retry_count}]",
                    level=logging.WARNING,
                )

                # Replace the failing code with whatever the retry helper
                # returns for this error; the next loop iteration runs it.
                code_to_run = self._retry_run_code(
                    code_to_run, self.context, self.logger, e
                )

        return LogicUnitOutput(
            result,
            True,
            "Code Executed Successfully",
            {"content_type": "response", "value": ResponseSerializer.serialize(result)},
            final_track_output=True,
        )

Additionally, you can use the OutputValidator class to ensure the validation of the type and value fields:

class OutputValidator:
    """Validate ``{"type": ..., "value": ...}`` result dicts.

    All methods are static: the class is only a namespace. ``validate``
    checks a result against an *expected* output type, while
    ``validate_result`` checks that a result's value agrees with the type
    the result itself declares.
    """

    # A string counts as a plot path when it matches this pattern
    # (absolute or relative path without whitespace).
    _PLOT_PATH_PATTERN = re.compile(
        r"^(\/[\w.-]+)+(/[\w.-]+)*$|^[^\s/]+(/[\w.-]+)*$"
    )

    # Accepted "number" types. np.int64 is listed explicitly because it is
    # not a subclass of int; sharing the tuple keeps both validators in sync.
    _NUMBER_TYPES = (int, float, np.int64)

    @staticmethod
    def validate(expected_type, result: dict[str, Any]) -> tuple[bool, Iterable[str]]:
        """Check ``result`` against ``expected_type``.

        Args:
            expected_type: The output type the caller asked for; a falsy
                value disables both checks.
            result: Dict with optional ``"type"`` and ``"value"`` keys.

        Returns:
            ``(ok, logs)`` where ``ok`` is True only when both the type and
            the value pass, and ``logs`` holds one message per failed check.
        """
        validation_logs = []
        result_type, result_value = result.get("type"), result.get("value")

        type_ok = OutputValidator.validate_type(result_type, expected_type)
        if not type_ok:
            validation_logs.append(
                f"The result dict contains inappropriate 'type'. "
                f"Expected '{expected_type}', actual '{result_type}'."
            )
        value_ok = OutputValidator.validate_value(result_value, expected_type)
        if not value_ok:
            validation_logs.append(
                f"result value {repr(result_value)} seems to be inappropriate "
                f"for the type '{expected_type}'."
            )

        return all((type_ok, value_ok)), validation_logs

    @staticmethod
    def validate_type(actual_type, expected_type: str) -> bool:
        """Return True when ``actual_type`` equals ``expected_type``.

        A falsy ``expected_type`` means "no expectation" and always passes.
        Declared static so it works both via the class and via an instance.
        """
        return actual_type == expected_type if expected_type else True

    @staticmethod
    def validate_value(value, expected_type: str) -> bool:
        """Return True when ``value`` is acceptable for ``expected_type``.

        A falsy ``expected_type`` always passes; an unrecognised one fails.
        """
        if not expected_type:
            return True
        if expected_type == "number":
            return isinstance(value, OutputValidator._NUMBER_TYPES)
        if expected_type == "string":
            return isinstance(value, str)
        if expected_type == "dataframe":
            return isinstance(value, (pd.DataFrame, pd.Series))
        if expected_type == "plot":
            if isinstance(value, dict):
                return True
            if not isinstance(value, str):
                return False
            return bool(OutputValidator._PLOT_PATH_PATTERN.match(value))
        # Unknown expected type: fail explicitly instead of returning None.
        return False

    @staticmethod
    def validate_result(result: dict) -> bool:
        """Return True when ``result["value"]`` matches ``result["type"]``.

        Raises:
            InvalidOutputValueMismatch: If ``result`` is not a dict or lacks
                the ``"type"`` or ``"value"`` key.
        """
        if (
            not isinstance(result, dict)
            or "type" not in result
            or "value" not in result
        ):
            raise InvalidOutputValueMismatch(
                "Result must be in the format of dictionary of type and value"
            )

        result_type, value = result["type"], result["value"]

        if not result_type:
            return False
        if result_type == "number":
            return isinstance(value, OutputValidator._NUMBER_TYPES)
        if result_type == "string":
            return isinstance(value, str)
        if result_type == "dataframe":
            return isinstance(value, (pd.DataFrame, pd.Series))
        if result_type == "plot":
            # Plotly figures are detected by type name so plotly itself does
            # not have to be imported here.
            if "plotly" in repr(type(value)):
                return True
            if isinstance(value, dict):
                return True
            if not isinstance(value, str):
                return False
            # Inline base64 PNGs are accepted as-is; anything else must look
            # like a file path.
            if "data:image/png;base64" in value:
                return True
            return bool(OutputValidator._PLOT_PATH_PATTERN.match(value))
        # Unknown declared type: fail explicitly instead of returning None.
        return False

This code ensures that the type field is always set to "answer" and includes logic to handle retries and prevent the system from reusing the last result indefinitely [1].

To continue talking to Dosu, mention @dosu.

gventuri commented 3 months ago

@colorwlof I'm sorry, I don't understand the first point. You can disable the cache for the second point.