Daethyra / Build-RAGAI

Interactive notes (Jupyter Notebooks) for building AI-powered applications
Other
26 stars 3 forks source link

`query_local_docs.py` does not return LLM Response | Add LangSmith tracing v2 #79

Closed Daethyra closed 9 months ago

Daethyra commented 9 months ago

Maybe?

import glob
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from functools import wraps

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI as OpenAILLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

def setup_logging():
    """Configure the root logger: INFO level, timestamped message format."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)

def custom_retry(max_retries=3, retry_exceptions=(Exception,), initial_delay=1, backoff_factor=2):
    """Decorator factory: retry the wrapped callable with exponential backoff.

    Args:
        max_retries: Total number of attempts before the exception propagates.
        retry_exceptions: Exception type(s) that trigger a retry; any other
            exception propagates immediately.
        initial_delay: Seconds to sleep before the first retry.
        backoff_factor: Multiplier applied to the delay after each retry.

    Returns:
        A decorator that wraps a function with the retry loop.

    Raises:
        Re-raises the last caught exception once max_retries is exhausted.
    """
    def decorator(func):
        @wraps(func)  # preserve __name__/__doc__ of the wrapped function
        def wrapper(*args, **kwargs):
            delay = initial_delay
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except retry_exceptions as e:
                    # BUG FIX: the original used datetime/timedelta/time without
                    # importing them, so the first caught exception raised
                    # NameError instead of retrying.
                    next_retry = datetime.now() + timedelta(seconds=delay)
                    logging.warning(f"Retry attempt {attempt} for {func.__name__} due to {e}. Next retry at {next_retry}.")
                    if attempt == max_retries:
                        raise
                    time.sleep(delay)
                    delay *= backoff_factor
        return wrapper
    return decorator

class PDFProcessor:
    """Load PDFs from disk, split them into chunks, and run similarity search.

    Holds one shared OpenAI embeddings client and one LLM, both configured
    from the OPENAI_API_KEY environment variable (loaded via python-dotenv).
    """

    def __init__(self):
        # Environment must be loaded before the OpenAI clients are built.
        self._load_env_vars()
        self._initialize_reusable_objects()

    @custom_retry(max_retries=3, retry_exceptions=(ValueError, FileNotFoundError))
    def _load_env_vars(self):
        """Read OPENAI_API_KEY from the environment (and any .env file)."""
        load_dotenv()
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if self.OPENAI_API_KEY:
            return
        raise ValueError("OPENAI_API_KEY is missing. Please set the environment variable.")

    def _initialize_reusable_objects(self):
        """Build the embeddings client and the completion LLM once, up front."""
        api_key = self.OPENAI_API_KEY
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.llm = OpenAILLM(temperature=0.25, openai_api_key=api_key)

    @staticmethod
    def get_user_query(prompt="Please enter your query: "):
        """Prompt on stdin and return the reply; raise ValueError if empty."""
        answer = input(prompt)
        if answer:
            return answer
        raise ValueError("Query should not be empty.")

    @custom_retry(max_retries=3, retry_exceptions=(FileNotFoundError,))
    def load_pdfs_from_directory(self, directory_path="data/"):
        """Load every *.pdf under directory_path; return one flat chunk list."""
        if not os.path.exists(directory_path):
            raise FileNotFoundError(f"The directory {directory_path} does not exist.")
        pdf_files = glob.glob(f"{directory_path}/*.pdf")
        if not pdf_files:
            raise FileNotFoundError(f"No PDF files found in the directory {directory_path}.")

        # One worker per file; each call yields that file's list of chunks.
        flattened = []
        with ThreadPoolExecutor() as pool:
            for chunks in pool.map(self._load_and_split_document, pdf_files):
                flattened.extend(chunks)
        return flattened

    def _load_and_split_document(self, file_path, chunk_size=2000, chunk_overlap=0):
        """Load one PDF and split it into chunks of ~chunk_size characters."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return splitter.split_documents(PyPDFLoader(file_path).load())

    def perform_similarity_search(self, docsearch, query):
        """Run docsearch.similarity_search(query); reject empty queries."""
        if query:
            return docsearch.similarity_search(query)
        raise ValueError("Query should not be empty.")

if __name__ == "__main__":
    try:
        setup_logging()
        pdf_processor = PDFProcessor()
        texts = pdf_processor.load_pdfs_from_directory()
        num_docs = len(texts)
        logging.info(f"Loaded {num_docs} document(s).")
        docsearch = Chroma.from_documents(texts, pdf_processor.embeddings)
        chain = load_qa_chain(pdf_processor.llm, chain_type="stuff")
        query = pdf_processor.get_user_query()
        result = pdf_processor.perform_similarity_search(docsearch, query)
        # BUG FIX (#79): the "stuff" QA chain expects a *list* of documents in
        # input_documents. The old loop passed one Document at a time, which
        # broke the chain call and produced no LLM response. Pass all retrieved
        # documents in a single call instead.
        answer = chain.run(input_documents=result, question=query)
        logging.info(answer)
    except Exception as e:
        logging.error(f"An error occurred: {e}")