# NOTE: scraped page header ("Closed Daethyra closed 9 months ago") — kept as a comment so the file parses.
import glob
import logging
import os
import time
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timedelta
from functools import wraps

from dotenv import load_dotenv
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI as OpenAILLM
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
def setup_logging():
    """Configure root logging at INFO level with timestamped messages."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
def custom_retry(max_retries=3, retry_exceptions=(Exception,), initial_delay=1, backoff_factor=2):
    """Decorator factory: retry a callable with exponential backoff.

    Args:
        max_retries: total number of attempts before the last exception
            is re-raised.
        retry_exceptions: exception type(s) that trigger a retry; anything
            else propagates immediately.
        initial_delay: seconds to wait after the first failure.
        backoff_factor: multiplier applied to the delay after each retry.

    Raises:
        Whatever the wrapped function raised on the final attempt.
    """
    def decorator(func):
        # BUG FIX: @wraps preserves func.__name__/__doc__ on the wrapper
        # (the original wrapper discarded the wrapped function's metadata).
        # Also note: datetime/timedelta/time were used here but never
        # imported at file level — fixed in the import block.
        @wraps(func)
        def wrapper(*args, **kwargs):
            attempts, delay = 0, initial_delay
            while attempts < max_retries:
                try:
                    return func(*args, **kwargs)
                except retry_exceptions as e:
                    attempts += 1
                    next_retry = datetime.now() + timedelta(seconds=delay)
                    logging.warning(
                        f"Retry attempt {attempts} for {func.__name__} due to {e}. Next retry at {next_retry}."
                    )
                    if attempts == max_retries:
                        # Out of attempts: surface the original exception.
                        raise
                    time.sleep(delay)
                    delay *= backoff_factor
        return wrapper
    return decorator
class PDFProcessor:
    """Loads PDFs from a directory, splits them into text chunks, and
    holds the reusable OpenAI embedding/LLM objects used downstream."""

    def __init__(self):
        self._load_env_vars()
        self._initialize_reusable_objects()

    @custom_retry(max_retries=3, retry_exceptions=(ValueError, FileNotFoundError))
    def _load_env_vars(self):
        """Read OPENAI_API_KEY from the environment (.env supported)."""
        load_dotenv()
        self.OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
        if self.OPENAI_API_KEY:
            return
        raise ValueError("OPENAI_API_KEY is missing. Please set the environment variable.")

    def _initialize_reusable_objects(self):
        """Create the embedding client and the LLM once, up front."""
        api_key = self.OPENAI_API_KEY
        self.embeddings = OpenAIEmbeddings(openai_api_key=api_key)
        self.llm = OpenAILLM(temperature=0.25, openai_api_key=api_key)

    @staticmethod
    def get_user_query(prompt="Please enter your query: "):
        """Prompt on stdin; an empty answer is rejected."""
        answer = input(prompt)
        if not answer:
            raise ValueError("Query should not be empty.")
        return answer

    @custom_retry(max_retries=3, retry_exceptions=(FileNotFoundError,))
    def load_pdfs_from_directory(self, directory_path="data/"):
        """Find every *.pdf under ``directory_path`` and return the
        combined list of text chunks, loading files in parallel."""
        if not os.path.exists(directory_path):
            raise FileNotFoundError(f"The directory {directory_path} does not exist.")
        matches = glob.glob(f"{directory_path}/*.pdf")
        if not matches:
            raise FileNotFoundError(f"No PDF files found in the directory {directory_path}.")
        with ThreadPoolExecutor() as pool:
            per_file_chunks = list(pool.map(self._load_and_split_document, matches))
        flattened = []
        for chunks in per_file_chunks:
            flattened.extend(chunks)
        return flattened

    def _load_and_split_document(self, file_path, chunk_size=2000, chunk_overlap=0):
        """Load one PDF and split its pages into character-based chunks."""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")
        pages = PyPDFLoader(file_path).load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return splitter.split_documents(pages)

    def perform_similarity_search(self, docsearch, query):
        """Delegate to the vector store; reject empty queries."""
        if not query:
            raise ValueError("Query should not be empty.")
        return docsearch.similarity_search(query)
if __name__ == "__main__":
    try:
        setup_logging()
        pdf_processor = PDFProcessor()

        # Load and chunk every PDF under the default "data/" directory.
        texts = pdf_processor.load_pdfs_from_directory()
        num_docs = len(texts)
        logging.info(f"Loaded {num_docs} document(s).")

        # Build an in-memory vector index over the chunks, then a
        # "stuff" QA chain that answers from retrieved documents.
        docsearch = Chroma.from_documents(texts, pdf_processor.embeddings)
        chain = load_qa_chain(pdf_processor.llm, chain_type="stuff")

        query = pdf_processor.get_user_query()
        result = pdf_processor.perform_similarity_search(docsearch, query)
        for r in result:
            # BUG FIX: load_qa_chain's run() expects input_documents to be
            # a *list* of documents; the original passed the bare Document
            # (input_documents=r), which the chain cannot iterate correctly.
            logging.info(chain.run(input_documents=[r], question=query))
    except Exception as e:
        # Top-level boundary: log the failure instead of dumping a traceback.
        logging.error(f"An error occurred: {e}")
# Requires a .env file (or environment) providing OPENAI_API_KEY.