DrAlzahraniProjects / csusb_fall2024_cse6550_team3

csusb_fall2024_cse6550_team3
4 stars 1 forks source link

Adding a vectorstore and document loader #101

Open Pavankunchala opened 1 week ago

Pavankunchala commented 1 week ago

If you check out my branch, you will see that I have added a new script that creates a vector store using FAISS.

import os
import logging
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain.vectorstores import FAISS

# Set up logging: configure the root logger at INFO level and create this
# module's logger. These statements sit at module top level, so they run on
# import as well as when the file is executed as a script.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to load or create a FAISS vector store and save locally
def load_or_create_vector_store(documents, collection_name, persist_directory):
    """Load the FAISS vector store saved for a collection, or build and save it.

    The store lives at ``os.path.join(persist_directory, collection_name)``.
    If that path already exists it is loaded and ``documents`` is ignored;
    otherwise a new store is built from ``documents`` and saved to that path.

    Args:
        documents: Documents to embed when no saved store exists.
        collection_name: Name of the store inside ``persist_directory``.
        persist_directory: Directory under which stores are saved; created
            when a new store is written.

    Returns:
        The loaded or newly created FAISS vector store.

    Raises:
        ValueError: If no saved store exists and ``documents`` is empty.
    """
    # Path to save the FAISS index
    index_file_path = os.path.join(persist_directory, collection_name)
    index_exists = os.path.exists(index_file_path)

    # FAISS.from_documents cannot build an index from zero documents (it fails
    # with an opaque IndexError), so reject that case up front -- before
    # paying the cost of loading the embedding model.
    if not index_exists and not documents:
        raise ValueError(
            f"No FAISS vector store found at {index_file_path} and no "
            "documents were provided to create one."
        )

    embedding_function = GPT4AllEmbeddings()

    if index_exists:
        logger.info(f"Loading existing FAISS vector store from {index_file_path}")
        # The saved docstore is a pickle file, and current langchain-community
        # releases refuse to load it unless the caller opts in explicitly.
        # NOTE(review): this is only safe because the index is one this
        # function wrote itself; never point it at an untrusted directory.
        # Releases that predate the flag do not accept this keyword.
        return FAISS.load_local(
            index_file_path,
            embedding_function,
            allow_dangerous_deserialization=True,
        )

    logger.info(f"Creating new FAISS vector store in {persist_directory}")
    vector_store = FAISS.from_documents(documents, embedding_function)

    # Save the FAISS index locally
    os.makedirs(persist_directory, exist_ok=True)
    vector_store.save_local(index_file_path)

    return vector_store

if __name__ == "__main__":
    # Example invocation with placeholder inputs: swap in real documents and
    # the desired collection name / storage directory before running.
    documents = []  # Load documents here
    collection_name, persist_directory = "example_collection", "./faiss_vectorstore"

    vector_store = load_or_create_vector_store(
        documents=documents,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )

Here is the code, in case anyone wants to take a look and change it.

Pavankunchala commented 1 week ago

Similarly, I have added the document loader code as well.

# load_documents_to_vector_store.py
import os
import logging
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from create_vector_store import load_or_create_vector_store  # Assuming this is in the same directory as script 1

# Set up logging: request INFO-level output on the root logger and create this
# module's logger, which the loader function below uses for progress messages.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Function to load and split PDF documents from a directory
def load_documents_from_directory(document_path: str, chunk_size: int = 2048, chunk_overlap: int = 200):
    """Load the PDFs found in a directory and split them into chunks.

    Args:
        document_path: Directory that ``PyPDFDirectoryLoader`` reads PDFs from.
        chunk_size: Maximum chunk length handed to the tiktoken-based
            splitter.
        chunk_overlap: Overlap between consecutive chunks, handed to the same
            splitter.

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    logger.info(f"Loading documents from {document_path}...")

    # Use load() rather than load_and_split(): the latter already chunks the
    # text with LangChain's default splitter, so splitting its output again
    # meant the chunk size configured here never actually took effect.
    loaded_documents = PyPDFDirectoryLoader(document_path).load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(loaded_documents)

if __name__ == "__main__":
    # Directory holding the source PDFs; its contents are loaded and chunked
    # before being handed to the vector store.
    document_path = './docs'
    documents = load_documents_from_directory(document_path)

    # Name and on-disk location of the FAISS store to load or create.
    collection_name, persist_directory = "example_collection", "./faiss_vectorstore"

    # Reuse the saved store if one exists, otherwise build it from the chunks.
    vector_store = load_or_create_vector_store(
        documents=documents,
        collection_name=collection_name,
        persist_directory=persist_directory,
    )