minglong-huang opened 2 months ago
To resolve the UnboundLocalError: local variable 'text' referenced before assignment error in your agenerate_text method, ensure that the variable text is properly assigned before it is used. The error occurs because text is not assigned a value if an exception is raised in the try block.
Here's how you can modify your agenerate_text method to handle this: initialize text before the try block, so that text has a value even if an exception occurs. Here's the updated code:
async def agenerate_text(
    self,
    prompt: PromptValue,
    n: int = 1,
    temperature: float = 1e-8,
    stop: t.Optional[t.List[str]] = None,
    callbacks: Callbacks = [],
) -> LLMResult:
    generations = []
    llm_output = {}
    token_total = 0
    content = prompt.to_string()
    print(f'running async def agenerate_text')
    text = ""  # Initialize text to avoid UnboundLocalError
    try:
        text, history = await asyncio.get_event_loop().run_in_executor(
            None, self.base_llm.chat, self.tokenizer, content, []
        )
    except asyncio.TimeoutError:
        print("操作超时,请检查代码或增加超时时间")
    except asyncio.CancelledError:
        print("任务被取消,请检查代码")
        info = traceback.format_exc()
        print(f"info ={info}")
    except Exception as e:
        print(f"发生未知错误:{e}")
        info = traceback.format_exc()
        print(f"info ={info}")
    generations.append([Generation(text=text)])
    token_total += len(text)
    llm_output['token_total'] = token_total
    result = LLMResult(generations=generations, llm_output=llm_output)
    return result
This ensures that text is always initialized, and if an exception occurs, text will still have a value (an empty string in this case), preventing the UnboundLocalError [1].
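Note also that without asyncio.wait_for, the except asyncio.TimeoutError branch can never fire, because run_in_executor alone has no timeout. If you want one, the same default-value pattern applies; here is a minimal sketch (the 60-second value is only a placeholder, not a recommendation):

text = ""  # default, so the code after the try block never sees an unbound name
try:
    text, history = await asyncio.wait_for(
        asyncio.get_event_loop().run_in_executor(
            None, self.base_llm.chat, self.tokenizer, content, []
        ),
        timeout=60,  # placeholder; tune to your model's latency
    )
except asyncio.TimeoutError:
    # text stays "" and the caller still receives an LLMResult with an empty generation
    print("chat() timed out")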
To continue talking to Dosu, mention @dosu.
@dosu my code:
import typing as t
import asyncio
from typing import List
from datasets import load_dataset, load_from_disk
from ragas.metrics import faithfulness, context_recall, context_precision
from ragas.metrics import AnswerRelevancy
from ragas import evaluate
from ragas.llms import BaseRagasLLM
from langchain.schema import LLMResult
from langchain.schema import Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from ragas.llms.prompt import PromptValue
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from FlagEmbedding import FlagModel
from FlagEmbedding import BGEM3FlagModel
from ragas.metrics import answer_relevancy
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.embeddings import BaseRagasEmbeddings
import asyncio
import traceback
from datasets import Dataset
from ragas.embeddings import LangchainEmbeddingsWrapper
import torch
from ragas.run_config import RunConfig, add_async_retry, add_retry
from abc import ABC
class MyLLM(BaseRagasLLM):
    def __init__(self, llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
        self.base_llm = self.base_llm
        self.base_llm = self.base_llm.to('cuda').eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = [],
    ):
        print(f'running generate_text function...')
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = [],
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text = None  # initialize the text variable
        print(f'running async def agenerate_text')
        # text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        # print(f'*'*15)
        # print(("Generated text: %s", text))
        try:
            text, history = await asyncio.get_event_loop().run_in_executor(
                None, self.base_llm.chat, self.tokenizer, content, []
            )
            # text, history = await asyncio.wait_for(
            #     asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer, content, []),
            #     timeout=42  # e.g. set the timeout to 60 seconds
            # )
        except asyncio.TimeoutError:
            print("操作超时,请检查代码或增加超时时间")
        except asyncio.CancelledError:
            print("任务被取消,请检查代码")
            info = traceback.format_exc()
            print(f"info ={info}")
        except Exception as e:
            print(f"发生未知错误:{e}")
        if text is not None:  # make sure text has been assigned
            generations.append([Generation(text=text)])
            token_total += len(text)
            llm_output['token_total'] = token_total
            result = LLMResult(generations=generations, llm_output=llm_output)
            return result
        else:
            raise ValueError("未能生成文本")

    async def generate(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: t.Optional[float] = None,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = None,
        is_async: bool = True,
    ) -> LLMResult:
        if temperature is None:
            temperature = 1e-8
        if is_async:
            return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
        else:
            return self.generate_text(prompt, n, temperature, stop, callbacks)
# class MyEmbedding(Embeddings):
#
#     def __init__(self, path, max_length=8192, batch_size=256):
#         self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
#         # self.model = BGEM3FlagModel(path, map_location='cuda')
#         self.max_length = max_length
#         self.batch_size = batch_size
#
#     def embed_documents(self, texts: List[str]) -> List[List[float]]:
#         return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()
#
#     def embed_query(self, text: str) -> List[float]:
#         return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()
class TEstEmbedding(Embeddings, ABC):
    run_config: RunConfig

    def __init__(self, model_path):
        self.embed_texts = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    def embed_text(self, text: str) -> List[float]:
        embs = self.embed_texts([text])
        return embs[0]

    def embed_texts(self, texts: List[str]) -> t.List[t.List[float]]:
        # loop = asyncio.get_event_loop()
        embed_documents_with_retry = add_retry(
            self.embed_documents, self.run_config
        )
        return embed_documents_with_retry(texts)

    async def aembed_text(self, text: str, is_async=True) -> List[float]:
        embs = await self.embed_texts([text], is_async=True)
        return embs[0]

    async def aembed_texts(
        self, texts: List[str], is_async: bool = True
    ) -> t.List[t.List[float]]:
        if is_async:
            aembed_documents_with_retry = add_async_retry(
                self.aembed_documents, self.run_config
            )
            return await aembed_documents_with_retry(texts)
        else:
            loop = asyncio.get_event_loop()
            embed_documents_with_retry = add_retry(
                self.embed_documents, self.run_config
            )
            return await loop.run_in_executor(None, embed_documents_with_retry, texts)

    def set_run_config(self, run_config: RunConfig):
        self.run_config = run_config

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.embed_texts.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.embed_texts.encode_queries(text, self.batch_size, self.max_length).tolist()
# Data
data_path = "/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")
MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
#MODEL_PATH = '/home/kelvin/nlp/model/LLM/Qwen/Qwen1.5-32B'
#MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
#embedding_model = MyEmbedding(embed_model_path)
embedding_model = TEstEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)
ans_relevancy = AnswerRelevancy()
data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts': [
        ['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
        ['The Green Bay Packers...Green Bay, Wisconsin.', 'The Packers compete...Football Conference'],
    ],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times'],
}
dataset = Dataset.from_dict(data_samples)
# amnesty_qa["eval"],
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    is_async=True,
    raise_exceptions=True,
)
df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)
# print(result)
error:
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [125,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Evaluating: 0%| | 0/8 [00:12<?, ?it/s]
Exception in thread Thread-3:
Traceback (most recent call last):
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
self.run()
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 75, in run
results = self.loop.run_until_complete(self._aresults())
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/base_events.py", line 641, in run_until_complete
return future.result()
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 63, in _aresults
raise e
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 58, in _aresults
发生未知错误:CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16BF, lda, b, CUDA_R_16BF, ldb, &fbeta, c, CUDA_R_16BF, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
发生未知错误:The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
File "/home/kelvin/.cache/huggingface/modules/transformers_modules/glm-4-9b-chat/modeling_chatglm.py", line 134, in apply_rotary_pos_emb
xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
x_out2 = torch.stack(
~~~~~~~~~~~ <--- HERE
[
xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
发生未知错误:CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16BF, lda, b, CUDA_R_16BF, ldb, &fbeta, c, CUDA_R_16BF, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
发生未知错误:CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
发生未知错误:CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
发生未知错误:CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
发生未知错误:CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
发生未知错误:The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
File "/home/kelvin/.cache/huggingface/modules/transformers_modules/glm-4-9b-chat/modeling_chatglm.py", line 134, in apply_rotary_pos_emb
xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
x_out2 = torch.stack(
~~~~~~~~~~~ <--- HERE
[
xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
r = await future
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py", line 575, in _wait_for_one
return f.result() # May raise f.exception().
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 91, in wrapped_callable_async
return counter, await callable(*args, **kwargs)
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 91, in ascore
raise e
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 87, in ascore
score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/_context_recall.py", line 113, in _ascore
result = await self.llm.generate(
File "/home/kelvin/nlp/graphrag/eval.py", line 126, in generate
return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
File "/home/kelvin/nlp/graphrag/eval.py", line 111, in agenerate_text
raise ValueError("未能生成文本")
ValueError: 未能生成文本
Traceback (most recent call last):
File "/home/kelvin/nlp/graphrag/eval.py", line 220, in <module>
result = evaluate(
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 231, in evaluate
raise e
File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 213, in evaluate
raise ExceptionInRunner()
ragas.exceptions.ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exception=False` incase you want to show only a warning message instead.
Task was destroyed but it is pending!
task: <Task pending name='context_recall-0' coro=<Executor.wrap_callable_with_index.<locals>.wrapped_callable_async() done, defined at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py:90> wait_for=<Future cancelled> cb=[as_completed.<locals>._on_completion() at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py:562]>
Task was destroyed but it is pending!
task: <Task pending name='faithfulness-1' coro=<Executor.wrap_callable_with_index.<locals>.wrapped_callable_async() done, defined at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py:90> wait_for=<Future cancelled> cb=[as_completed.<locals>._on_completion() at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py:562]>
Process finished with exit code 1
To resolve the CUDA errors and the ValueError: 未能生成文本 ("failed to generate text") error in your agenerate_text method, ensure that agenerate_text is correctly implemented and that the necessary configurations and dependencies are properly set up. Here is the relevant part of the agenerate_text method from the LangchainLLMWrapper class:
async def agenerate_text(
    self,
    prompt: PromptValue,
    n: int = 1,
    temperature: t.Optional[float] = None,
    stop: t.Optional[t.List[str]] = None,
    callbacks: Callbacks = None,
) -> LLMResult:
    if temperature is None:
        temperature = self.get_temperature(n=n)
    if is_multiple_completion_supported(self.langchain_llm):
        return await self.langchain_llm.agenerate_prompt(
            prompts=[prompt],
            n=n,
            temperature=temperature,
            stop=stop,
            callbacks=callbacks,
        )
    else:
        result = await self.langchain_llm.agenerate_prompt(
            prompts=[prompt] * n,
            temperature=temperature,
            stop=stop,
            callbacks=callbacks,
        )
        # make LLMResult.generation appear as if it was n_completions
        # note that LLMResult.runs is still a list that represents each run
        generations = [[g[0] for g in result.generations]]
        result.generations = generations
        return result
Ensure that:
- The prompt parameter is correctly formatted and passed to the method.
- The langchain_llm object is properly initialized and supports the agenerate_prompt method.
- The temperature, stop, and callbacks parameters are correctly handled.
Additionally, check that the langchain_llm object is compatible with your GPU and CUDA setup.
From the SpecificQA class in the ragas_experimental module, ensure that the LLM is properly initialized before calling any generation methods. For example:
assert self.llm is not None, "LLM is not initialized"
This assertion is crucial to avoid errors related to uninitialized LLM objects. Also, ensure that the prompts used in the generate_questions and generate_question methods are correctly formatted and that the LLM can handle them without issues.
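Separately, the Indexing.cu assertion in your log (srcIndex < srcSelectDimSize) usually indicates an out-of-range index on the GPU, for example a token or position id larger than the table it indexes into; the CUBLAS and TorchScript errors that follow are often just fallout from that first failure. The traceback itself suggests CUDA_LAUNCH_BLOCKING=1, which makes CUDA calls synchronous so the Python stack trace points at the real failing line. A minimal reproduction sketch (the model path and chat call are copied from your MyLLM class; the prompt is just an example):

# Sketch: force synchronous CUDA error reporting, then reproduce a single chat call.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA is initialized

import torch
from transformers import AutoModel, AutoTokenizer

MODEL_PATH = "/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = (
    AutoModel.from_pretrained(
        MODEL_PATH, trust_remote_code=True, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True
    )
    .to("cuda")
    .eval()
)

# Any prompt that ragas would send works here; this one is from your data_samples.
text, history = model.chat(tokenizer, "When was the first super bowl?", history=[])
print(text)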
If the issue persists, you may need to provide more specific details about the CUDA errors and the context in which the "ValueError: 未能生成文本" error occurs [1][2].
To continue talking to Dosu, mention @dosu.
My code:
and error: