run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License

Streaming error in version 0.6.5 #3247

Closed willhamlam closed 1 year ago

willhamlam commented 1 year ago

Following the guide, but I still got the error below (with both 'gpt-3.5-turbo' and 'text-davinci-003' models):

ValueError: LLM must support streaming and set streaming=True.

Here is my code:

from llama_index import SimpleDirectoryReader

documents = SimpleDirectoryReader('data').load_data()
from llama_index import LLMPredictor, PromptHelper, ServiceContext
from langchain import OpenAI

llm = OpenAI(
  temperature=0, 
  model_name="gpt-3.5-turbo",
  frequency_penalty = 0.5,
  max_tokens=2048,
  streaming=True,
)
llm_predictor = LLMPredictor(llm=llm)

prompt_helper = PromptHelper(
    max_input_size = 4096, 
    num_output = 2048, 
    max_chunk_overlap = 20, 
    chunk_size_limit = 700
)

service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
from llama_index import GPTVectorStoreIndex

index = GPTVectorStoreIndex.from_documents(
    documents, service_context=service_context
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import ResponseSynthesizer

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    streaming=True,
    similarity_top_k=3,
)
# define custom QuestionAnswerPrompt
from llama_index import QuestionAnswerPrompt

query_str = "XXXX"
QA_PROMPT_TMPL = (
    "We have provided context information below. \n"
    "---------------------\n"
    "{context_str}"
    "\n---------------------\n"
    "Given all this information, please answer the following questions,"
    "You MUST use the SAME language as the question:\n"
    "{query_str}\n"
)
QA_PROMPT = QuestionAnswerPrompt(QA_PROMPT_TMPL)
response_synthesizer = ResponseSynthesizer.from_args(
    streaming=True,
    text_qa_template=QA_PROMPT
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)
response_stream = query_engine.query(query_str)
response_stream.print_response_stream()
logan-markewich commented 1 year ago

For gpt-3.5, you need to use the ChatOpenAI class

Which version of llama index do you have? Try upgrading with pip install --upgrade llama-index
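
For reference, a minimal sketch of what that swap could look like (assuming the langchain ChatOpenAI wrapper, as used later in this thread):

from langchain.chat_models import ChatOpenAI
from llama_index import LLMPredictor

# gpt-3.5-turbo is a chat model, so it needs the ChatOpenAI wrapper instead of OpenAI
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
    max_tokens=2048,
    streaming=True,
)
llm_predictor = LLMPredictor(llm=llm)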

You can also simplify your code a bit

index = GPTVectorStoreIndex.from_documents(
    documents, service_context=service_context
)

query_engine = index.as_query_engine(similarity_top_k=3, text_qa_template=QA_PROMPT, streaming=True)

response_stream = query_engine.query(query_str)
response_stream.print_response_stream()
willhamlam commented 1 year ago

Thanks man, I changed the code

response_synthesizer = ResponseSynthesizer.from_args(
    streaming=True,
    text_qa_template=QA_PROMPT
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer
)

Into

query_engine = index.as_query_engine(similarity_top_k=3, text_qa_template=QA_PROMPT, streaming=True)

Now it works!

What is the key point that I am missing?

logan-markewich commented 1 year ago

@willhamlam I need to review the docs/code, but the arguments you passed into the response synthesizer likely need to go somewhere else 😅 that's my best guess anyway.

The code that works there handles all that arg passing under the hood, so it can be confusing to trace
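
As a rough illustration of that guess (assuming the 0.6.x ResponseSynthesizer.from_args accepts a service_context), the service context carrying the streaming-enabled predictor would go to the synthesizer rather than the retriever:

retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,  # streaming is handled by the synthesizer, not the retriever
)
response_synthesizer = ResponseSynthesizer.from_args(
    service_context=service_context,  # carries the streaming-enabled LLM predictor
    streaming=True,
    text_qa_template=QA_PROMPT,
)
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)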

vishalp-simplecrm commented 1 year ago

I'm in the same boat, getting the same error.

This is my code:

import os
import pickle
from langchain import OpenAI
from llama_index.node_parser import SimpleNodeParser
from flask import Flask, render_template, request
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from llama_index import LLMPredictor, PromptHelper, ServiceContext, download_loader, MockLLMPredictor, MockEmbedding, GPTVectorStoreIndex
from langchain.chat_models import ChatOpenAI
from IPython.core.display import HTML
from llama_index import StorageContext, load_index_from_storage

os.environ['OPENAI_API_KEY'] = 'your api key'

def authorize_gdocs():
    google_oauth2_scopes = [
        "https://www.googleapis.com/auth/drive.readonly",
        "https://www.googleapis.com/auth/documents.readonly"
    ]
    cred = None
    if os.path.exists("token.pickle"):
        with open("token.pickle", 'rb') as token:
            cred = pickle.load(token)
    if not cred or not cred.valid:
        if cred and cred.expired and cred.refresh_token:
            cred.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file("client_secrets.json", google_oauth2_scopes)
            cred = flow.run_local_server(port=0)
        with open("token.pickle", 'wb') as token:
            pickle.dump(cred, token)

authorize_gdocs()

Drive_loader

GoogleDriveReader = download_loader('GoogleDriveReader')
folder_id = '1AuhkobVmt0Et0lIrEU0swvwavwXRtJYi'
loader = GoogleDriveReader()
documents = loader.load_data(folder_id=folder_id)

parsing documents into Nodes

parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents)

llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo",streaming=True))

# Define prompt helper
max_input_size = 3500
num_output = 500
max_chunk_overlap = 20

prompt_helper = PromptHelper(max_input_size, num_output,max_chunk_overlap)

# Define LLM
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
path = r'C:\Users\Lenovo\Desktop\DocumentBot'  # raw string so the backslashes are not treated as escapes

Create index from documents

storage_context = StorageContext.from_defaults(persist_dir=path)
index = GPTVectorStoreIndex.from_documents(nodes, service_context=service_context)

persisting index

index.storage_context.persist()

# rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir=path)

# load index
index = load_index_from_storage(storage_context)
query_engine = index.as_query_engine(
    streaming=True,
    similarity_top_k=1
)

query_engine = index.as_query_engine()
prompt = input("Ask a question: ")
print('prompt given issss:', prompt)
response = query_engine.query(prompt)
print('Responseeeeee:', response)

logan-markewich commented 1 year ago

@vishalp-simplecrm

Two things

  1. Make sure you pass the service context back in when loading from storage

  2. To use streaming, you have a few options:

response_stream.print_response_stream()

OR

gen = response.response_gen
for word in gen:
    ....

OR

all_text = response.get_response()
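
Putting point 1 together with the streaming options, a minimal sketch (assuming the 0.6.x load_index_from_storage accepts a service_context):

storage_context = StorageContext.from_defaults(persist_dir=path)
# pass the service context back in so the reloaded index uses the streaming-enabled predictor
index = load_index_from_storage(storage_context, service_context=service_context)

query_engine = index.as_query_engine(streaming=True, similarity_top_k=1)
response_stream = query_engine.query("your question here")
response_stream.print_response_stream()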

vishalp-simplecrm commented 1 year ago

Thanks @logan-markewich It works now

But I'm facing a new issue: how to render the response on an HTML webpage.

I have tried to display it using a Flask app.

Here is the code for it:

app = Flask(__name__)

@app.route('/')
def home():
    return render_template('index.html')

@app.route('/query', methods=['POST'])
def query():
    query_engine = index.as_query_engine(similarity_top_k=3, streaming=True)
    prompt = request.form['prompt']
    prompt += " Give in html format"
    print('prompt given issss:', prompt)
    response_stream = query_engine.query(prompt)
    response = response_stream.print_response_stream()
    return render_template('index.html', prompt=prompt, response=response)

if __name__ == '__main__':
    app.run(debug=True)

Also, here is the related HTML script:

<body>
<div class="container">
    <h1><Center>Documents Based QA Bot</Center></h1>
    <form id="queryForm" action="/query" method="post">
        <label for="prompt" >Enter a prompt:</label>
        <input type="text" id="prompt" name="prompt">
        <input type="submit" value="Submit">
    </form>
    <div id="result">
        {% if prompt %}
        <h2>Recent Prompt:</h2>
        <p>{{ prompt }}</p>
        {% endif %}
        {% if response %}
        <h2>Response:</h2>
        <p>{{ response|safe }}</p>
        {% endif %}
    </div>
</div>
<div class="loader" id="loader"></div>

<script>
    const form = document.getElementById('queryForm');
    const loader = document.getElementById('loader');
    const resultDiv = document.getElementById('result');

    form.addEventListener('submit', function() {
        loader.style.display = 'block';
        resultDiv.style.display = 'none';
    });

    // If there's a response, hide the loader
    // {% if response %}
    // loader.style.display = 'none';
    // {% endif %}
</script>

That1Panda commented 1 year ago

from langchain.chat_models import ChatOpenAI

worked for me, thanks

Disiok commented 1 year ago

@vishalp-simplecrm you need to pass the generator to flask, rather than using print response stream. We will have a tutorial on this soon!
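
Until that tutorial is out, a rough sketch of the idea (route and variable names here are just illustrative): print_response_stream() only prints to stdout, so the token generator itself has to be handed to Flask:

from flask import Response

@app.route('/query', methods=['POST'])
def query():
    query_engine = index.as_query_engine(similarity_top_k=3, streaming=True)
    response_stream = query_engine.query(request.form['prompt'])
    # stream the generator to the client instead of printing it server-side
    return Response(response_stream.response_gen, mimetype='text/plain')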

uzumakinaruto19 commented 1 year ago

We will have a tutorial on this soon!

Is that tutorial out?