Status: Open — issue opened by Z-oo883 1 week ago.
@Dosu
To generate questions in Chinese using the TestsetGenerator
, you need to specify the language when adapting the generator. Here's how you can do it:
# --- Suggested answer: generate Chinese questions with ragas' TestsetGenerator ---
from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
# NOTE(review): langchain.embeddings / langchain.document_loaders are deprecated
# import paths; newer releases expose these under langchain_community.*
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

# Load the PDF and split it into per-page documents.
loader = PyPDFLoader("xx.pdf")
documents = loader.load_and_split()
for document in documents:
    # ragas expects a 'filename' key in each document's metadata.
    document.metadata['filename'] = document.metadata['source']

# Generator and critic LLMs served through an OpenAI-compatible endpoint.
# Qwen2 chat models end turns with '<|im_end|>' (ChatML); the original
# '<im_end>' never matches the model's actual stop token, so generations
# would not be cut off where intended.
generator_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)
critic_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)

# Raw string: the original mixed '\e' (invalid/deprecated escape, kept
# literally by CPython) with '\\'; a raw literal spells the same Windows
# path without relying on escape-sequence quirks.
embedding_model_name = r"\embedding\bge-large-zh-v1.5"
embedding_model_kwargs = {'device': 'cpu'}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True}
embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Build the test-set generator from the LangChain wrappers.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embed_model
)

# Adapt the evolution prompts to Chinese BEFORE generating.
language = "chinese"
generator.adapt(language, evolutions=[simple, reasoning, multi_context])

# Generate the test set (evolution distribution weights must sum to 1.0).
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=1,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)
df = testset.to_pandas()
# 'utf-8_sig' normalizes to the utf_8_sig codec (UTF-8 with BOM, so Excel
# opens Chinese text correctly); canonical spelling is 'utf-8-sig'.
df.to_csv("test_set.csv", index=False, encoding='utf-8_sig')
This code adapts the generator to Chinese before generating the test set.
To continue talking to Dosu, mention @dosu.
To generate questions in Chinese using the TestsetGenerator
# (Duplicate render of the answer above, reconstructed from a single collapsed line.)
# You need to specify the language when adapting the generator. Here's how:
from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

# Load and prepare documents; ragas requires a 'filename' metadata key.
loader = PyPDFLoader("xx.pdf")
documents = loader.load_and_split()
for document in documents:
    document.metadata['filename'] = document.metadata['source']

# Initialize models. Qwen2's ChatML stop token is '<|im_end|>' — the
# original '<im_end>' would never match the model's output.
generator_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)
critic_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)

# Raw string avoids the invalid '\e' escape in the original path literal.
embedding_model_name = r"\embedding\bge-large-zh-v1.5"
embedding_model_kwargs = {'device': 'cpu'}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True}
embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

# Create TestsetGenerator from the LangChain wrappers.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embed_model
)

# Adapt to Chinese before generating the test set.
language = "chinese"
generator.adapt(language, evolutions=[simple, reasoning, multi_context])

# Generate test set and export (BOM'd UTF-8 so Excel reads Chinese text).
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=1,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)
df = testset.to_pandas()
df.to_csv("test_set.csv", index=False, encoding='utf-8_sig')
This code adapts the generator to Chinese before generating the test set.
To continue talking to Dosu, mention @dosu.
But according to the code you provided, the following error occurred. What should I do? @dosu
Traceback (most recent call last):
  File "D:\RA_LLM\pythonProject\generate_test_data.py", line 53, in <module>
[error message truncated in the original post — the "in <module>" marker and the exception text were likely stripped as HTML]
Hi @Z-oo883 — this is a pending item for the new test-set generation and we will add it to our roadmap.
How do I set the language of the generated questions when creating a test set? Code:
# --- Original question code (reconstructed from collapsed lines) ---
# nest_asyncio lets ragas' asyncio machinery run inside an already-running
# event loop (e.g. Jupyter).
import nest_asyncio
nest_asyncio.apply()

from ragas.testset import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader

# Load and split the PDF; ragas requires a 'filename' metadata key.
loader = PyPDFLoader("xx.pdf")
documents = loader.load_and_split()
print(documents)
for document in documents:
    document.metadata['filename'] = document.metadata['source']

# generator with openai models
generator_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)
critic_llm = ChatOpenAI(
    model="Qwen2",
    temperature=0.3,
    openai_api_key="xxx",
    openai_api_base='xxx',
    stop=['<|im_end|>']
)

# Raw string: the original "\embedding\bge-large-zh-v1.5" contained a real
# '\b' (backspace) escape, silently corrupting the path to
# '\embedding<BS>ge-large-zh-v1.5'.
embedding_model_name = r"\embedding\bge-large-zh-v1.5"
embedding_model_kwargs = {'device': 'cpu'}
embedding_encode_kwargs = {'batch_size': 32, 'normalize_embeddings': True}

embed_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs
)

generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embed_model
)

# generate testset
# NOTE(review): no generator.adapt(...) call here — this is the code the
# question is about; without adaptation the prompts stay in English.
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=1,
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25},
)
df = testset.to_pandas()
print(testset)
df.to_csv("test_set.csv", index=False, encoding='utf-8_sig')