Closed Daethyra closed 8 months ago
"""Build a Chroma vector store from the Python sources of a git repository."""

import os
import shutil

from git import Repo
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.embeddings.jina import JinaEmbeddings
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma


def clone_repository(repo_url, repo_path):
    """Clone the git repository at ``repo_url`` into ``repo_path``.

    Args:
        repo_url: URL (or local path) of the repository to clone.
        repo_path: Destination directory for the working copy.

    Returns:
        The ``git.Repo`` object for the fresh clone.
    """
    return Repo.clone_from(repo_url, to_path=repo_path)


def load_code_files(repo_path):
    """Load every ``.py`` file under ``repo_path`` as parsed documents.

    Args:
        repo_path: Root directory that is searched recursively (``**/*``).

    Returns:
        The documents produced by ``GenericLoader`` with a Python
        ``LanguageParser``.
    """
    loader = GenericLoader.from_filesystem(
        repo_path,
        glob="**/*",
        suffixes=[".py"],
        parser=LanguageParser(language=Language.PYTHON),
    )
    return loader.load()


def split_documents(documents):
    """Split documents into Python-aware chunks.

    Uses ``RecursiveCharacterTextSplitter`` configured for Python with a
    chunk size of 2000 and an overlap of 200.

    Args:
        documents: Documents as returned by ``load_code_files``.

    Returns:
        The chunked documents.
    """
    splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON,
        chunk_size=2000,
        chunk_overlap=200,
    )
    return splitter.split_documents(documents)


def embed_chunks(chunks, persist_directory=None):
    """Embed chunks with ``JinaEmbeddings`` and index them in Chroma.

    Args:
        chunks: Chunked documents as returned by ``split_documents``.
        persist_directory: Optional directory in which Chroma stores the
            collection. When ``None`` (the default, matching the previous
            signature) no directory is passed and the store cannot be
            persisted afterwards.

    Returns:
        The ``Chroma`` vector store holding the embedded chunks.
    """
    # NOTE(review): JinaEmbeddings() is built without arguments, so it
    # presumably reads its credentials from the environment -- confirm.
    embeddings = JinaEmbeddings()
    return Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=persist_directory,
    )


def save_vectorstore(vectorstore, chromadb_path):
    """Flush ``vectorstore`` to its on-disk Chroma directory.

    A Chroma store writes to the directory it was created with, so the
    store should have been built with ``persist_directory=chromadb_path``
    (see ``embed_chunks``).

    Args:
        vectorstore: The ``Chroma`` store to persist.
        chromadb_path: Kept for backward compatibility; the target directory
            is the one fixed when the store was constructed.
    """
    # The Chroma wrapper exposes ``persist()`` rather than ``save()``. Look it
    # up defensively: wrappers that persist automatically may not offer it.
    persist = getattr(vectorstore, "persist", None)
    if callable(persist):
        persist()


def cleanup_repository(repo_path):
    """Delete the cloned working copy at ``repo_path``.

    Does nothing when the directory no longer exists, so it is safe to call
    more than once.

    Args:
        repo_path: Directory of the clone to remove.
    """
    if not os.path.isdir(repo_path):
        return
    # Opening the repo first confirms the directory really is a git
    # repository before the tree is deleted; close() releases its resources.
    repo = Repo(repo_path)
    repo.close()
    # os.remove() only deletes files; a directory tree needs rmtree().
    shutil.rmtree(repo_path)


def prepare_vector_db(repo_url, repo_path, chromadb_path):
    """Prepare a vector database for similarity search (RAG) over code.

    Clones the repository, loads and chunks its Python files, embeds the
    chunks into a Chroma store persisted at ``chromadb_path``, and finally
    removes the clone.

    Args:
        repo_url: URL of the repository to index.
        repo_path: Temporary directory for the clone; removed afterwards.
        chromadb_path: Directory in which the Chroma collection is persisted.
    """
    # Clone outside the try block: if cloning fails (for example because
    # repo_path already exists) a pre-existing directory must not be deleted.
    repo = clone_repository(repo_url, repo_path)
    try:
        documents = load_code_files(repo_path)
        chunks = split_documents(documents)
        vectorstore = embed_chunks(chunks, persist_directory=chromadb_path)
        save_vectorstore(vectorstore, chromadb_path)
    finally:
        # Always release the clone, even when loading or embedding failed.
        repo.close()
        cleanup_repository(repo_path)
Will release this in a notebook on branch 1.1.1.