To speed up LLM inference and enhance the LLM's perception of key information, LLMLingua compresses the prompt and KV-Cache, achieving up to 20x compression with minimal performance loss.
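(For reference, a minimal sketch of the underlying llmlingua API, assuming the default compression model can be downloaded; the context string here is illustrative:)

from llmlingua import PromptCompressor

# Loads the default small model used to score token importance (a sizable download).
compressor = PromptCompressor()

# Compress an illustrative context down to roughly 200 tokens.
result = compressor.compress_prompt(
    "Paul Graham worked on writing and programming outside of school...",
    question="What did the author do growing up?",
    target_token=200,
)
print(result["compressed_prompt"])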
Describe the bug
The llama-index RAG demo no longer works because library calls changed significantly after updating llama-index to version 0.10. Could you help me troubleshoot where the problem might be? Thank you.
Steps to reproduce
Install
!pip install llmlingua llama-index llama-index-embeddings-huggingface llama-index-embeddings-instructor llama-index-llms-openai llama-index-llms-openai-like llama-index-readers-file pymupdf llama-index-retrievers-bm25 transformers llama_hub
!wget "https://www.dropbox.com/s/f6bmb19xdg0xedm/paul_graham_essay.txt?dl=1" -O paul_graham_essay.txt
Import
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings
from llama_index.llms.openai_like import OpenAILike
Setup LLMLingua
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.legacy.postprocessor.longllmlingua import LongLLMLinguaPostprocessor
from llama_index.core import QueryBundle
from llama_index.llms.openai import OpenAI
import os
import openai
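Note that this still imports the postprocessor from llama_index.legacy. Under 0.10's namespaced packaging, the non-legacy import would presumably come from a separate integration package (package and module names are my assumption, unverified against the installed version):

# Hypothetical 0.10-style replacement for the legacy import above:
# pip install llama-index-postprocessor-longllmlingua
from llama_index.postprocessor.longllmlingua import LongLLMLinguaPostprocessor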
Embedding
load documents
documents = SimpleDirectoryReader(input_files=["paul_graham_essay.txt"]).load_data()
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
index = VectorStoreIndex.from_documents(documents)
# question = "What did the author do growing up?"
# question = "What did the author do during his time in YC?"
question = "Where did the author go for art school?"
retriever = index.as_retriever(similarity_top_k=10)
Ground-truth Answer
answer = "RISD"
contexts = retriever.retrieve(question)
context_list = [n.get_content() for n in contexts]
len(context_list)
llm = OpenAILike(
    model="gpt-3.5-turbo",
    api_base="https://api.??????.com.cn/v1",
    api_key="sk-***",
    is_chat_model=True,
)
llm2 = OpenAILike(
    model="gpt-3.5-turbo-0125",
    api_base="https://api.?????.com.cn/v1",
    api_key="sk-**",
    is_chat_model=True,
)
prompt = "\n\n".join(context_list + [question])
response = llm.complete(prompt)
print(str(response))
LongLLMLingua
node_postprocessor = LongLLMLinguaPostprocessor(
    instruction_str="Given the context, please answer the final question",
    target_token=400,
    rank_method="longllmlingua",
    additional_compress_kwargs={
        "condition_compare": True,
        "condition_in_question": "after",
        "context_budget": "+100",
        "reorder_context": "sort",  # enable document reorder
        "dynamic_context_compression_ratio": 0.3,
    },
)
Settings.llm = llm2
retrieved_nodes = retriever.retrieve(question)
synthesizer = CompactAndRefine()
Outline of the steps inside RetrieverQueryEngine, for clarity: postprocess (compress), then synthesize.
new_retrieved_nodes = node_postprocessor.postprocess_nodes(
    retrieved_nodes, query_bundle=QueryBundle(query_str=question)
)
original_contexts = "\n\n".join([n.get_content() for n in retrieved_nodes])
compressed_contexts = "\n\n".join([n.get_content() for n in new_retrieved_nodes])
original_tokens = node_postprocessor._llm_lingua.get_token_length(original_contexts)
compressed_tokens = node_postprocessor._llm_lingua.get_token_length(compressed_contexts)
print(compressed_contexts)
print()
print("Original Tokens:", original_tokens)
print("Compressed Tokens:", compressed_tokens)
print("Compression Ratio:", f"{original_tokens/(compressed_tokens + 1e-5):.2f}x")
It goes wrong here (the error may be caused by llama-index):
response = synthesizer.synthesize(question, new_retrieved_nodes)
retriever_query_engine = RetrieverQueryEngine.from_args(
    retriever, node_postprocessors=[node_postprocessor]
)
response = retriever_query_engine.query(question)
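One way to check whether the failure comes from the llama-index wrapper rather than LLMLingua itself is to call llmlingua directly on the retrieved contexts, bypassing the postprocessor. A sketch using the same settings as above (the direct call and default compression model are my assumptions, not part of the original repro):

from llmlingua import PromptCompressor

# Reuse the same compression settings as the postprocessor above.
compressor = PromptCompressor()  # downloads the default compression model
result = compressor.compress_prompt(
    [n.get_content() for n in retrieved_nodes],
    instruction="Given the context, please answer the final question",
    question=question,
    target_token=400,
    rank_method="longllmlingua",
    condition_compare=True,
    condition_in_question="after",
    context_budget="+100",
    reorder_context="sort",
    dynamic_context_compression_ratio=0.3,
)
# If this succeeds, the ValidationError likely originates in llama-index,
# e.g. from mixing llama_index.legacy and llama_index.core node types.
print(result["compressed_prompt"])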
Expected Behavior
As expected, the comparison results of the two methods (original vs. compressed prompt) should be printed correctly.
Logs
ValidationError Traceback (most recent call last)