Daethyra / Build-RAGAI

Interactive notes (Jupyter Notebooks) for building AI-powered applications
Other
26 stars 3 forks source link

Jina embeddings + vector store module #105

Closed Daethyra closed 8 months ago

Daethyra commented 8 months ago

import os
from git import Repo
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.text_splitter import Language
from langchain.embeddings.jina import JinaEmbeddings
from langchain.vectorstores.chroma import Chroma

def clone_repository(repo_url, repo_path):
    """
    Clone the git repository at ``repo_url`` into the directory ``repo_path``.

    Returns the object produced by ``Repo.clone_from`` so the caller can
    keep working with (or close) the freshly cloned repository.
    """
    return Repo.clone_from(repo_url, to_path=repo_path)

def load_code_files(repo_path):
    """
    Load the Python source files found under ``repo_path`` as documents.

    Every path below ``repo_path`` is matched (``**/*``) and then filtered
    down to the ``.py`` suffix; each file is parsed with a LanguageParser
    configured for Python.
    """
    python_parser = LanguageParser(language=Language.PYTHON)
    code_loader = GenericLoader.from_filesystem(
        repo_path, glob="**/*", suffixes=[".py"], parser=python_parser
    )
    return code_loader.load()

def split_documents(documents, chunk_size=2000, chunk_overlap=200):
    """
    Splits the documents into chunks using RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split, e.g. the output of load_code_files().
        chunk_size: Maximum size of each chunk passed to the splitter
            (defaults to 2000, the value this module has always used).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter (defaults to 200).

    Returns:
        The chunks produced by the splitter's split_documents().
    """
    # The module-level imports only bring in `Language` from
    # langchain.text_splitter, so the splitter class itself is imported here;
    # without this the function fails with NameError on first call.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

def embed_chunks(chunks, persist_directory=None):
    """
    Embeds the chunks using JinaEmbeddings and indexes them in a Chroma store.

    Args:
        chunks: Documents to embed, e.g. the output of split_documents().
        persist_directory: Optional directory handed to Chroma for on-disk
            storage. With the default of None the store is created without
            a persistence directory, so it cannot be persisted afterwards.

    Returns:
        The Chroma vectorstore built from the chunks.
    """
    # NOTE(review): JinaEmbeddings() is built without arguments, so it
    # presumably picks up its API credentials from the environment - confirm.
    embeddings = JinaEmbeddings()
    return Chroma.from_documents(
        chunks, embeddings, persist_directory=persist_directory
    )


def save_vectorstore(vectorstore, chromadb_path):
    """
    Saves the vectorstore to ChromaDB.

    The Chroma wrapper has no ``save(path)`` method; the storage location is
    fixed when the store is created and ``persist()`` writes to it.

    Args:
        vectorstore: A Chroma vectorstore created with
            ``persist_directory=chromadb_path`` (see embed_chunks()).
        chromadb_path: Directory the store is expected to persist to. Kept
            for interface compatibility; the directory actually used is the
            one the store was created with.

    Raises:
        ValueError: Propagated from ``persist()`` when the store was created
            without a persistence directory.
    """
    vectorstore.persist()


def cleanup_repository(repo_path):
    """
    Cleans up the cloned repository by deleting its working directory.

    Args:
        repo_path: Path of the cloned repository to delete.
    """
    # GitPython's rmtree removes the whole directory tree and also copes with
    # the read-only files git leaves under .git on Windows. os.remove() only
    # works on single files and always failed on the clone directory.
    from git.util import rmtree

    repo = Repo(repo_path)
    repo.close()
    rmtree(repo_path)


def prepare_vector_db(repo_url, repo_path, chromadb_path):
    """
    Prepares a vector database for similarity searching for RAG over code.

    Clones the repository, loads and splits its Python files, embeds the
    chunks into a Chroma store persisted at ``chromadb_path``, and finally
    removes the clone.

    Args:
        repo_url: URL of the git repository to index.
        repo_path: Local directory to clone into; deleted again afterwards.
        chromadb_path: Directory where the Chroma database is persisted.
    """
    # Clone outside the try block: if cloning fails, repo_path may be a
    # pre-existing directory that this function must not delete.
    repo = clone_repository(repo_url, repo_path)
    try:
        # Load the code files
        documents = load_code_files(repo_path)

        # Split the documents into chunks
        chunks = split_documents(documents)

        # Embed the chunks straight into a store backed by chromadb_path
        vectorstore = embed_chunks(chunks, persist_directory=chromadb_path)

        # Save the vectorstore to ChromaDB
        save_vectorstore(vectorstore, chromadb_path)
    finally:
        # Release the clone's handle, then remove the clone even when an
        # earlier step failed, so no stale checkout is left on disk.
        repo.close()
        cleanup_repository(repo_path)
Daethyra commented 8 months ago

Will release this in a notebook on branch 1.1.1.