explodinggradients / ragas

Evaluation framework for your Retrieval Augmented Generation (RAG) pipelines
https://docs.ragas.io
Apache License 2.0

ValueError: a cannot be empty unless no samples are taken #1109

Open Rugved2204 opened 1 month ago

Rugved2204 commented 1 month ago

[ ] I have checked the documentation and related resources and couldn't resolve my bug.

Describe the bug: ValueError: a cannot be empty unless no samples are taken

Ragas version: 0.1.10
Python version:

Code to Reproduce

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
import nest_asyncio
from langchain_community.document_loaders import PubMedLoader
from langchain.text_splitter import CharacterTextSplitter
from ragas.testset.docstore import InMemoryDocumentStore
from ragas.testset.extractor import KeyphraseExtractor
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

embedding_model_name = 'BAAI/bge-small-en-v1.5'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda:0'}
)

response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=500,
    do_sample=True,
)
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply()
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()
print(len(documents))

generator = TestsetGenerator.from_langchain(
    generator_llm=response_generation_llm,
    critic_llm=response_generation_llm,
    embeddings=embeddings,
)

testset = generator.generate_with_langchain_docs(
    documents,
    test_size=2,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    is_async=False,
    raise_exceptions=False,
)

Error trace

  File "/home/ubuntu/scp-analyzer/rag-old/test_set.py", line 100, in <module>
    testset = generator.generate_with_langchain_docs(documents, test_size=2, distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25}, is_async=False, raise_exceptions=False)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/generator.py", line 210, in generate_with_langchain_docs
    return self.generate(
  File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/generator.py", line 279, in generate
    for n in self.docstore.get_random_nodes(k=test_size)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/ragas/testset/docstore.py", line 329, in get_random_nodes
    nodes = rng.choice(np.array(self.nodes), size=k, p=prob).tolist()
  File "numpy/random/_generator.pyx", line 803, in numpy.random._generator.Generator.choice
ValueError: a cannot be empty unless no samples are taken

Expected behavior: the test set should be generated without errors.

Additional context: @jjmachan @shahules786

dosubot[bot] commented 1 month ago

The error ValueError: a cannot be empty unless no samples are taken occurs because the list of documents loaded from PubMedLoader is empty. Ensure that the documents list is not empty before passing it to the TestsetGenerator. You can add a check to verify that documents are loaded successfully:

from langchain_community.document_loaders import PubMedLoader
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch
import nest_asyncio

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

embedding_model_name = 'BAAI/bge-small-en-v1.5'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda:0'}
)

response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=500,
    do_sample=True,
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply()
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

# Check if documents are loaded
if not documents:
    raise ValueError("No documents were loaded. Please check your query or parameters.")

print(len(documents))

generator = TestsetGenerator.from_langchain(
    generator_llm=response_generation_llm,
    critic_llm=response_generation_llm,
    embeddings=embeddings,
)

testset = generator.generate_with_langchain_docs(
    documents, 
    test_size=2, 
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    is_async=False,
    raise_exceptions=False
)

This will ensure that you do not attempt to generate a test set with an empty list of documents [1][2].

To continue talking to Dosu, mention @dosu.

Rugved2204 commented 1 month ago

The list is not empty. @dosu

dosubot[bot] commented 1 month ago

If the list of documents is not empty and you are still encountering the ValueError: a cannot be empty unless no samples are taken error, it might be due to the docstore not being populated correctly. Ensure that the documents are added to the docstore before generating the test set.

Here's an updated version of your code to ensure the docstore is populated:

from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
import nest_asyncio
from langchain_community.document_loaders import PubMedLoader
from langchain.embeddings import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
import torch

model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

embedding_model_name = 'BAAI/bge-small-en-v1.5'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda:0'}
)

response_generation_pipeline = pipeline(
    model=mistral_model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.1,
    repetition_penalty=1.1,
    return_full_text=True,
    max_new_tokens=500,
    do_sample=True,
)

response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

nest_asyncio.apply()
loader = PubMedLoader("liver", load_max_docs=10)
documents = loader.load()

# Check if documents are loaded
if not documents:
    raise ValueError("No documents were loaded. Please check your query or parameters.")

print(len(documents))

generator = TestsetGenerator.from_langchain(
    generator_llm=response_generation_llm,
    critic_llm=response_generation_llm,
    embeddings=embeddings,
)

# Add documents to the docstore
generator.docstore.add_documents(documents)
print(f"Docstore contains {len(generator.docstore.nodes)} nodes")

# Generate the test set
testset = generator.generate_with_langchain_docs(
    documents, 
    test_size=2, 
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
    is_async=False,
    raise_exceptions=False
)

This ensures that the documents are added to the docstore before attempting to generate the test set [1].

To continue talking to Dosu, mention @dosu.

samiislam commented 1 month ago

Hi @dosu,

I have the exact same problem and noticed that in the KeyphraseExtractor.extract method, the extracted keyphrases value is a list of keyphrase dicts:

[{'keyphrases': ['Black hole', 'Region of spacetime', 'Strong gravity', 'Light and electromagnetic waves', 'Theory of general relativity']}, {'keyphrases': ['Great Wall of China', 'Ancient fortifications', 'Northern China']}, {'keyphrases': ['New Samsung Washer', 'Installation', 'Use and Care', 'Important Information', 'Manual', 'Operating Instructions', 'Warnings and Important Safety Instructions', 'Common Sense', 'Caution and Care', 'Electricity and Moving Parts', 'Personal Injury', 'Death', 'Property Damage']}]

The first two seem to be based on the examples in the prompt; the last one is from the document that I am feeding it.

On line 53 in ragas/testset/extractor.py:

async def extract(self, node: Node, is_async: bool = True) -> t.List[str]:
    prompt = self.extractor_prompt.format(text=node.page_content)
    results = await self.llm.generate(prompt=prompt, is_async=is_async)
    keyphrases = await json_loader.safe_load(
        results.generations[0][0].text.strip(), llm=self.llm, is_async=is_async
    )
    keyphrases = keyphrases if isinstance(keyphrases, dict) else {}
    logger.debug("topics: %s", keyphrases)
    return keyphrases.get("keyphrases", [])

keyphrases is set to an empty dict whenever json_loader returns a list, as it does in my case. I am not sure whether this is the intended way to handle the otherwise-valid keyphrases being generated.
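One possible workaround would be to tolerate the list case instead of discarding it. Below is a sketch of a patched extract; the assumption that the last dict in the list belongs to the actual input document is mine, based on the output above, and is not a confirmed fix:

async def extract(self, node: Node, is_async: bool = True) -> t.List[str]:
    prompt = self.extractor_prompt.format(text=node.page_content)
    results = await self.llm.generate(prompt=prompt, is_async=is_async)
    keyphrases = await json_loader.safe_load(
        results.generations[0][0].text.strip(), llm=self.llm, is_async=is_async
    )
    # Workaround (assumption): when safe_load returns a list of dicts, the last
    # entry appears to correspond to the input document (the earlier ones match
    # the few-shot examples in the prompt), so fall back to it.
    if isinstance(keyphrases, list):
        keyphrases = keyphrases[-1] if keyphrases and isinstance(keyphrases[-1], dict) else {}
    elif not isinstance(keyphrases, dict):
        keyphrases = {}
    logger.debug("topics: %s", keyphrases)
    return keyphrases.get("keyphrases", [])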

Because keyphrases ends up empty, the nodes are never appended; line 264 in ragas/testset/docstore.py is not executed:

def add_nodes(self, nodes: t.Sequence[Node], show_progress=True):
    assert self.embeddings is not None, "Embeddings must be set"
    assert self.extractor is not None, "Extractor must be set"

    # NOTE: Adds everything in async mode for now.
    nodes_to_embed = {}
    nodes_to_extract = {}

    # get embeddings for the docs
    executor = Executor(
        desc="embedding nodes",
        keep_progress_bar=False,
        raise_exceptions=True,
        run_config=self.run_config,
    )
    result_idx = 0
    for i, n in enumerate(nodes):
        if n.embedding is None:
            nodes_to_embed.update({i: result_idx})
            executor.submit(
                self.embeddings.embed_text,
                n.page_content,
                name=f"embed_node_task[{i}]",
            )
            result_idx += 1

        if not n.keyphrases:
            nodes_to_extract.update({i: result_idx})
            executor.submit(
                self.extractor.extract,
                n,
                name=f"keyphrase-extraction[{i}]",
            )
            result_idx += 1
    results = executor.results()
    if not results:
        raise ExceptionInRunner()

    for i, n in enumerate(nodes):
        if i in nodes_to_embed.keys():
            n.embedding = results[nodes_to_embed[i]]
        if i in nodes_to_extract.keys():
            keyphrases = results[nodes_to_extract[i]]
            n.keyphrases = keyphrases

        if n.embedding is not None and n.keyphrases != []:
            self.nodes.append(n)
            self.node_map[n.doc_id] = n
            assert isinstance(
                n.embedding, (list, np.ndarray)
            ), "Embedding must be list or np.ndarray"
            self.node_embeddings_list.append(n.embedding)

    self.calculate_nodes_docs_similarity()
    self.set_node_relataionships()
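As a quick check for anyone else hitting this, the docstore can be inspected before generation. This is only a diagnostic sketch: generator.docstore, its nodes attribute, and add_documents are the ragas 0.1.x names visible in the traceback and code quoted above.

# Diagnostic sketch: names follow the ragas 0.1.x internals quoted above.
# If every node is dropped in add_nodes (empty keyphrases), the docstore stays
# empty and get_random_nodes later raises the ValueError from this issue.
generator.docstore.add_documents(documents)
print(f"nodes in docstore: {len(generator.docstore.nodes)}")
if not generator.docstore.nodes:
    print("Docstore is empty: all nodes were dropped, most likely because "
          "keyphrase extraction returned [].")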
jjmachan commented 1 month ago

@shahules786 can you take a look at this?