langchain-ai / langchain

🦜🔗 Build context-aware reasoning applications
https://python.langchain.com
MIT License

Self Query Retriever with Google Flan T5 models issue #7839

Closed RanElgedawy closed 1 year ago

RanElgedawy commented 1 year ago

System Info

    Traceback (most recent call last):
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/output_parsers/json.py:32 in parse_and_check_json_markdown
          json_obj = parse_json_markdown(text)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/output_parsers/json.py:25 in parse_json_markdown
          parsed = json.loads(json_str)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/json/__init__.py:346 in loads
          return _default_decoder.decode(s)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/json/decoder.py:337 in decode
          obj, end = self.raw_decode(s, idx=_w(s, 0).end())
      /home/cloud/anaconda3/envs/mir/lib/python3.10/json/decoder.py:355 in raw_decode
          raise JSONDecodeError("Expecting value", s, err.value) from None
    JSONDecodeError: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/query_constructor/base.py:37 in parse
          parsed = parse_and_check_json_markdown(text, expected_keys)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/output_parsers/json.py:34 in parse_and_check_json_markdown
          raise OutputParserException(f"Got invalid JSON object. Error: {e}")
    OutputParserException: Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)

During handling of the above exception, another exception occurred:

    Traceback (most recent call last):
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/IPython/core/magics/execution.py:1325 in time
          exec(code, glob, local_ns)
      in :1
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/base.py:149 in __call__
          raise e
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/base.py:143 in __call__
          self._call(inputs, run_manager=run_manager)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/conversational_retrieval/base.py:110 in _call
          docs = self._get_docs(new_question, inputs)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/conversational_retrieval/base.py:191 in _get_docs
          docs = self.retriever.get_relevant_documents(question)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/retrievers/self_query/base.py:96 in get_relevant_documents
          self.llm_chain.predict_and_parse(callbacks=callbacks, **inputs),
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/llm.py:281 in predict_and_parse
          return self.prompt.output_parser.parse(result)
      /home/cloud/anaconda3/envs/mir/lib/python3.10/site-packages/langchain/chains/query_constructor/base.py:50 in parse
          raise OutputParserException(
    OutputParserException: Parsing text
    json "query": "patient medical notes", "ID": "11542052" "old": ""
     raised following error:
    Got invalid JSON object. Error: Expecting value: line 1 column 1 (char 0)

Who can help?

@hwchase17 @ag

Information

Related Components

Reproduction

    # Imports assumed from context (Hugging Face transformers)
    from transformers import AutoTokenizer, pipeline

    model_id = 'google/flan-t5-xl'
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    pipe = pipeline(
        model=model_id,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.,
    )

    # Imports assumed from context (LangChain 0.0.x self-query docs)
    from langchain.llms import HuggingFacePipeline
    from langchain.retrievers.self_query.base import SelfQueryRetriever
    from langchain.chains.query_constructor.base import AttributeInfo
    from langchain.memory import ConversationBufferMemory
    from langchain.chains import ConversationalRetrievalChain

    llm = HuggingFacePipeline(pipeline=pipe)

    document_content_description = "Patient medical notes"
    metadata_field_info = [
        AttributeInfo(
            name="ID",
            description="The unique identifier 'ID' of the patient",
            type="string",
        ),
        AttributeInfo(
            name="source",
            description="source of the document",
            type="string",
        ),
    ]

    # `db` is the existing vector store (not shown in the issue)
    retriever = SelfQueryRetriever.from_llm(
        llm,
        db,
        document_content_description,
        metadata_field_info,
        verbose=True,
    )

    memory = ConversationBufferMemory(memory_key="chat_history", output_key='answer')
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory,
        get_chat_history=lambda h: h,
    )

Expected behavior

Expected behavior should be something like the attached output from a Vicuna model (attachment: outputofvicuna).

dosubot[bot] commented 1 year ago

Answer generated by a 🤖

Answer

I understand that you're encountering a JSONDecodeError and an OutputParserException when using the Self Query Retriever with Google Flan T5 models in the LangChain library. The error seems to occur when the library attempts to parse JSON output from the language model.

The error message "Expecting value: line 1 column 1 (char 0)" typically indicates that the JSON decoder is unable to find any content to parse at the beginning of the string. This could be due to an empty string or a string that doesn't start with a valid JSON character (like '{' for an object or '[' for an array).
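
For illustration, here is a minimal example of that decoder behaviour, using a made-up non-JSON string rather than the actual model output:

    import json

    try:
        # A string that does not begin with a JSON value ('{', '[', a quote, a number, ...)
        json.loads("query: patient medical notes")
    except json.JSONDecodeError as e:
        print(e)  # Expecting value: line 1 column 1 (char 0)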

In your case, it appears that the output from the language model (LLM) is not in the expected JSON format. The JsonOutputFunctionsParser class in the LangChain library is trying to parse the output from the LLM as JSON, but it's failing because the output is not valid JSON.

The parse_result method in the JsonOutputFunctionsParser class is where the JSON parsing is happening. It's trying to load the function arguments as a JSON object using json.loads(func). If the JSON loading fails, it raises an OutputParserException.

Here's the relevant code from the JsonOutputFunctionsParser class:

class JsonOutputFunctionsParser(OutputFunctionsParser):
    def parse_result(self, result: List[Generation]) -> Any:
        func = super().parse_result(result)
        if self.args_only:
            return json.loads(func)
        func["arguments"] = json.loads(func["arguments"])
        return func

To address this issue, you need to ensure that the output from the LLM is in the correct format. The output should be a string that includes a JSON object. If the output from the LLM is not in this format, the JsonOutputFunctionsParser will not be able to parse it, and you'll get the error you're seeing.
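
For reference, a reply in the format the parser expects looks roughly like the sketch below; the values are illustrative, and `parse_and_check_json_markdown` is the helper shown in the traceback above:

    from langchain.output_parsers.json import parse_and_check_json_markdown

    # A hypothetical, well-formed model reply: a markdown ```json snippet
    # containing at least the "query" and "filter" keys the parser requires.
    good_reply = """```json
    {
        "query": "patient medical notes",
        "filter": "NO_FILTER"
    }
    ```"""

    parsed = parse_and_check_json_markdown(good_reply, ["query", "filter"])
    print(parsed)  # {'query': 'patient medical notes', 'filter': 'NO_FILTER'}

A reply like this parses into a dict with the required `query` and `filter` keys, which the query constructor then turns into a structured query for the retriever.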

Please note that this is one potential solution based on the information provided. If this doesn't resolve your issue or if your experience contradicts this solution, I recommend checking if you're using the latest version of the LangChain library, as the issue might have been addressed in a more recent update.

If you believe this is a bug that could impact other users and you're comfortable doing so, you're encouraged to make a pull request with a proposed fix. Your contribution would be greatly appreciated by the LangChain community.

Sources


This response is meant to be useful, save you time, and share context. It is not meant to be a precise solution, but rather a starting point for your own research.

Help me be more useful! Please leave a 👍 if this is helpful and 👎 if it is irrelevant.

dosubot[bot] commented 1 year ago

Hi, @RanElgedawy! I'm Dosu, and I'm helping the LangChain team manage our backlog. I wanted to let you know that we are marking this issue as stale.

Based on my understanding, the issue you reported is related to an error in parsing JSON objects when using a self query retriever with Google Flan T5 models in the LangChain library. It seems that the error occurs when trying to parse an invalid JSON object, resulting in an "Expecting value" error. I have provided a detailed response explaining that the error is likely due to the output from the language model not being in the expected JSON format. I suggest ensuring that the output is a string that includes a JSON object. I have also provided relevant code snippets and suggest checking for updates to the LangChain library.

Before we close this issue, I wanted to check with you if it is still relevant to the latest version of the LangChain repository. If it is, please let us know by commenting on the issue. Otherwise, feel free to close the issue yourself or it will be automatically closed in 7 days.

Thank you for your contribution to the LangChain repository!

abdobeid commented 12 months ago

I encountered the same issue using a Flan-T5 model. To troubleshoot the output from the model, I enabled debugging and captured the prompt generated by LangChain. I then tried to generate the output directly from the model. Here is the code:

temp= """Your goal is to structure the user's query to match the request schema provided below.

<< Structured Request Schema >>
When responding use a markdown code snippet with a JSON object formatted in the following schema:

{
    \"query\": string \\ text string to compare to document contents
    \"filter\": string \\ logical condition statement for filtering documents
}

The query string should contain only text that is expected to match the contents of documents. Any conditions in the filter should not be mentioned in the query as well.

A logical condition statement is composed of one or more comparison and logical operation statements.

A comparison statement takes the form: comp(attr, val):

A logical operation statement takes the form op(statement1, statement2, ...):

Make sure that you only use the comparators and logical operators listed above and no others. Make sure that filters only refer to attributes that exist in the data source. Make sure that filters only use the attributed names with its function names if there are functions applied on them. Make sure that filters only use format YYYY-MM-DD when handling timestamp data typed values. Make sure that filters take into account the descriptions of attributes and only make comparisons that are feasible given the type of data being stored. Make sure that filters are only used as needed. If there are no filters that should be applied return \"NO_FILTER\" for the filter value.

<< Example 1. >>
Data Source:

{
    \"content\": \"Lyrics of a song\",
    \"attributes\": {
        \"artist\": {
            \"type\": \"string\",
            \"description\": \"Name of the song artist\"
        },
        \"length\": {
            \"type\": \"integer\",
            \"description\": \"Length of the song in seconds\"
        },
        \"genre\": {
            \"type\": \"string\",
            \"description\": \"The song genre, one of \"pop\", \"rock\" or \"rap\"\"
        }
    }
}

User Query: What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre

Structured Request:

{
    \"query\": \"teenager love\",
    \"filter\": \"and(or(eq(\\\"artist\\\", \\\"Taylor Swift\\\"), eq(\\\"artist\\\", \\\"Katy Perry\\\")), lt(\\\"length\\\", 180), eq(\\\"genre\\\", \\\"pop\\\"))\"
}

<< Example 2. >>
Data Source:

{
    \"content\": \"Lyrics of a song\",
    \"attributes\": {
        \"artist\": {
            \"type\": \"string\",
            \"description\": \"Name of the song artist\"
        },
        \"length\": {
            \"type\": \"integer\",
            \"description\": \"Length of the song in seconds\"
        },
        \"genre\": {
            \"type\": \"string\",
            \"description\": \"The song genre, one of \"pop\", \"rock\" or \"rap\"\"
        }
    }
}

User Query: What are songs that were not published on Spotify

Structured Request:

{
    \"query\": \"\",
    \"filter\": \"NO_FILTER\"
}

<< Example 3. >>
Data Source:

{
    \"content\": \"Audit reports\",
    \"attributes\": {
    \"source\": {
        \"description\": \"The report file the chunk is from\",
        \"type\": \"string\"
    },
    \"page\": {
        \"description\": \"The page from the lecture\",
        \"type\": \"integer\"
    }
}
}

User Query: What documents are from page 1?

Structured Request:"

"""

    # Tokenize the captured prompt and generate directly from the base Flan-T5 model
    features = tokenizer([temp], return_tensors='pt', max_length=1000)
    out = base_model.generate(
        input_ids=features['input_ids'],
        attention_mask=features['attention_mask'],
        max_length=1000,
    )
    print(tokenizer.decode(out[0]))

The output:

    To structure the user's query to match the request schema provided, use a markdown code snippet with a JSON object formatted in the following schema: json "query": "string", "attributes": "artist", "type": "string", "description": "name of the song artist", "length": "type": "integer", "description": "Length of the song in seconds" This will match the schema for the user's query.
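
As a side note, a similar prompt capture can also be done without copying the template by hand, assuming a LangChain release that exposes the global debug flag; a rough sketch using the retriever from the reproduction code above:

    import langchain

    # Assumption: the installed 0.0.x release has the global debug flag, which
    # logs chain inputs/outputs and the exact prompts sent to the LLM.
    langchain.debug = True

    # Illustrative query; any question routed through the self-query retriever works.
    retriever.get_relevant_documents("patient medical notes for patient 11542052")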