run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License

[Question]: How to use local model in RAG cli #14503

Open 812406210 opened 3 months ago

812406210 commented 3 months ago

Question

I want to use a local model with the RAG CLI, but it raises an error; the RAG CLI uses the OpenAI LLM by default. How should I configure the settings?

dosubot[bot] commented 3 months ago

Hello @812406210! 👋 I'm here to help you with any bugs, questions, or contributions you have. If you need assistance, feel free to ask!

To configure the RAG CLI to use a local model instead of the default OpenAI LLM, initialize RagCLI with your local model parameters. Here is the relevant code:

# Imports assume the llama-index >= 0.10 package layout; the Ollama, HuggingFace
# embedding, and Chroma integrations ship as separate pip packages
# (llama-index-llms-ollama, llama-index-embeddings-huggingface,
# llama-index-vector-stores-chroma), and RagCLI comes from llama-index-cli.
from typing import Optional

import chromadb

from llama_index.cli.rag import RagCLI
from llama_index.cli.rag.base import default_ragcli_persist_dir
from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.core.ingestion import IngestionCache, IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.query_pipeline import InputComponent, QueryPipeline
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.vector_stores.chroma import ChromaVectorStore


def init_local_rag_cli(
    persist_dir: Optional[str] = None,
    verbose: bool = False,
    llm_model_name: str = "mixtral:8x7b",
    embed_model_name: str = "BAAI/bge-m3",
) -> RagCLI:
    """Init local RAG CLI."""
    docstore = SimpleDocumentStore()
    persist_dir = persist_dir or default_ragcli_persist_dir()
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    chroma_collection = chroma_client.create_collection("default", get_or_create=True)
    vector_store = ChromaVectorStore(
        chroma_collection=chroma_collection, persist_dir=persist_dir
    )
    print("> Chroma collection initialized")
    llm = Ollama(model=llm_model_name, request_timeout=120.0)
    print("> LLM initialized")
    embed_model = HuggingFaceEmbedding(model_name=embed_model_name, pooling="mean")
    print("> Embedding model initialized")

    ingestion_pipeline = IngestionPipeline(
        transformations=[SentenceSplitter(), embed_model],
        vector_store=vector_store,
        docstore=docstore,
        cache=IngestionCache(),
    )

    # Note: ServiceContext is deprecated in newer llama-index releases, where
    # llm/embed_model are passed directly or set globally via Settings.
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
    retriever = VectorStoreIndex.from_vector_store(
        ingestion_pipeline.vector_store, service_context=service_context
    ).as_retriever(similarity_top_k=8)
    response_synthesizer = CompactAndRefine(
        service_context=service_context, streaming=True, verbose=True
    )
    # define query pipeline
    query_pipeline = QueryPipeline(verbose=verbose)
    query_pipeline.add_modules(
        {
            "input": InputComponent(),
            "retriever": retriever,
            "summarizer": response_synthesizer,
        }
    )
    query_pipeline.add_link("input", "retriever")
    query_pipeline.add_link("retriever", "summarizer", dest_key="nodes")
    query_pipeline.add_link("input", "summarizer", dest_key="query_str")

    return RagCLI(
        ingestion_pipeline=ingestion_pipeline,
        llm=llm,  # optional
        persist_dir=persist_dir,
        query_pipeline=query_pipeline,
        verbose=False,
    )
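
Before wiring everything into the CLI, you can sanity-check that the local models are reachable. This sketch assumes the Ollama server is running with the chosen model already pulled, and that the optional integration packages llama-index-llms-ollama and llama-index-embeddings-huggingface are installed:

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama

# Assumes `ollama pull mixtral:8x7b` has already been run locally.
llm = Ollama(model="mixtral:8x7b", request_timeout=120.0)
print(llm.complete("Reply with a single word: hello"))

# Downloads the BAAI/bge-m3 weights from HuggingFace on first use.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-m3")
print(len(embed_model.get_text_embedding("hello world")))  # embedding dimension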

To use a local model, call this function with the desired local model names for llm_model_name and embed_model_name. For example:

rag_cli_instance = init_local_rag_cli(
    llm_model_name="mixtral:8x7b",
    embed_model_name="your_local_embedding_model_name"
)
rag_cli_instance.cli()

This will configure the RAG CLI to use the specified local models instead of the default OpenAI LLM [1][2][3].
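
For reference, here is a minimal sketch of a complete wrapper script (the filename local_rag_cli.py is just an example). RagCLI.cli() parses its own command-line arguments, mirroring the built-in llamaindex-cli rag interface, so the exact subcommand and flags may vary slightly between llama-index-cli versions:

# local_rag_cli.py -- hypothetical wrapper around init_local_rag_cli() above
if __name__ == "__main__":
    rag_cli_instance = init_local_rag_cli(
        llm_model_name="mixtral:8x7b",
        embed_model_name="BAAI/bge-m3",
    )
    rag_cli_instance.cli()

# Example invocations (flags mirror `llamaindex-cli rag`):
#   python local_rag_cli.py rag --files "./docs"
#   python local_rag_cli.py rag --question "What do these documents describe?"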

To continue talking to Dosu, mention @dosu.