explodinggradients / ragas

Evaluation framework for your Retrieval Augmented Generation (RAG) pipelines
https://docs.ragas.io
Apache License 2.0

Error running local LLM #1255

Open minglong-huang opened 1 week ago

minglong-huang commented 1 week ago

My code:

import typing as t
import asyncio
from typing import List
from datasets import load_dataset, load_from_disk
from ragas.metrics import faithfulness, context_recall, context_precision
from ragas.metrics import AnswerRelevancy
from ragas import evaluate
from ragas.llms import BaseRagasLLM
from langchain.schema import LLMResult
from langchain.schema import Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from ragas.llms.prompt import PromptValue
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from FlagEmbedding import FlagModel
from FlagEmbedding import BGEM3FlagModel
from ragas.metrics import answer_relevancy
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.embeddings import BaseRagasEmbeddings
import asyncio
import traceback
from datasets import Dataset
from ragas.embeddings import LangchainEmbeddingsWrapper
import torch
from ragas.run_config import RunConfig, add_async_retry, add_retry
from abc import ABC

class MyLLM(BaseRagasLLM):

    def __init__(self,llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = self.base_llm
        self.base_llm = self.base_llm.to('cuda').eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()

        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ):
        print(f'runing generate_text function...')
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        print(f'running async def agenerate_text')
        # text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        # print(f'*'*15)
        # print(("Generated text: %s", text))
        try:
            text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,
                                                                       content, [])

            # text, history = await asyncio.wait_for(
            #     asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,content,[]),
            #     timeout=42  # e.g., set the timeout to 60 seconds
            # )

        except asyncio.TimeoutError:
            print("Operation timed out; check the code or increase the timeout")
        except asyncio.CancelledError:
            print("Task was cancelled; check the code")
            info = traceback.format_exc()
            print(f"info ={info}")
        except Exception as e:
            print(f"Unknown error occurred: {e}")

        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        result = LLMResult(generations=generations, llm_output=llm_output)
        return result

    async def generate(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: t.Optional[float] = None,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = None,
            is_async: bool = True,
    ) -> LLMResult:
        if temperature is None:
            temperature = 1e-8
        if is_async:
            return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
        else:
            return self.generate_text(prompt, n, temperature, stop, callbacks)

# class MyEmbedding(Embeddings):
#
#     def __init__(self, path,max_length=8192, batch_size=256):
#         self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
#         #self.model = BGEM3FlagModel(path, map_location='cuda')
#         self.max_length = max_length
#         self.batch_size = batch_size
#
#     def embed_documents(self, texts: List[str]) -> List[List[float]]:
#         return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()
#
#     def embed_query(self, text: str) -> List[float]:
#         return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

class TEstEmbedding(Embeddings, ABC):
    run_config: RunConfig
    def __init__(self,model_path):
        self.embed_texts = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    def embed_text(self, text: str) -> List[float]:
        embs = self.embed_texts([text])
        return embs[0]

    def embed_texts(self, texts: List[str]) -> t.List[t.List[float]]:

        # loop = asyncio.get_event_loop()
        embed_documents_with_retry = add_retry(
            self.embed_documents, self.run_config
        )
        return embed_documents_with_retry(texts)

    async def aembed_text(self, text: str, is_async=True) -> List[float]:
        embs = await self.embed_texts([text], is_async=is_async)
        return embs[0]

    async def aembed_texts(
        self, texts: List[str], is_async: bool = True
    ) -> t.List[t.List[float]]:
        if is_async:
            aembed_documents_with_retry = add_async_retry(
                self.aembed_documents, self.run_config
            )
            return await aembed_documents_with_retry(texts)
        else:
            loop = asyncio.get_event_loop()
            embed_documents_with_retry = add_retry(
                self.embed_documents, self.run_config
            )
            return await loop.run_in_executor(None, embed_documents_with_retry, texts)

    def set_run_config(self, run_config: RunConfig):
        self.run_config = run_config

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.embed_texts.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.embed_texts.encode_queries(text, self.batch_size, self.max_length).tolist()
# Data
data_path = "/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")

# MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
# MODEL_PATH = '/home/kelvin/nlp/model/LLM/Qwen/Qwen1.5-32B'
MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'

embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
# embedding_model = MyEmbedding(embed_model_path)
embedding_model = TEstEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

ans_relevancy = AnswerRelevancy()

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)
# amnesty_qa["eval"],
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    is_async = True,
    raise_exceptions=True,
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

# print(result)

And the error:

../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [123,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [124,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [125,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Evaluating:   0%|          | 0/8 [00:04<?, ?it/s]
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 75, in run
    results = self.loop.run_until_complete(self._aresults())
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/base_events.py", line 641, in run_until_complete
    return future.result()
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 63, in _aresults
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 58, in _aresults
    r = await future
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py", line 575, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 91, in ascore
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 87, in ascore
    score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/_faithfulness.py", line 180, in _ascore
    answer_result = await self.llm.generate(
  File "/home/kelvin/nlp/graphrag/eval.py", line 121, in generate
    return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
  File "/home/kelvin/nlp/graphrag/eval.py", line 103, in agenerate_text
    generations.append([Generation(text=text)])
UnboundLocalError: local variable 'text' referenced before assignment
Traceback (most recent call last):
  File "/home/kelvin/nlp/graphrag/eval.py", line 215, in <module>
    result = evaluate(
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 231, in evaluate
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 213, in evaluate
    raise ExceptionInRunner()
ragas.exceptions.ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exception=False` incase you want to show only a warning message instead.

Process finished with exit code 1
dosubot[bot] commented 1 week ago

To resolve the UnboundLocalError: local variable 'text' referenced before assignment error in your agenerate_text method, ensure that the variable text is properly assigned before it is used. The error occurs because text is not assigned a value if an exception is raised in the try block.

Here's how you can modify your agenerate_text method to handle this:

  1. Initialize text before the try block.
  2. Add error handling to ensure text is assigned a value even if an exception occurs.

Here's the updated code:

async def agenerate_text(
        self,
        prompt: PromptValue,
        n: int = 1,
        temperature: float = 1e-8,
        stop: t.Optional[t.List[str]] = None,
        callbacks: Callbacks = [],
) -> LLMResult:
    generations = []
    llm_output = {}
    token_total = 0
    content = prompt.to_string()
    print(f'running async def agenerate_text')

    text = ""  # Initialize text to avoid UnboundLocalError

    try:
        text, history = await asyncio.get_event_loop().run_in_executor(
            None, self.base_llm.chat, self.tokenizer, content, []
        )
    except asyncio.TimeoutError:
        print("Operation timed out; check the code or increase the timeout")
    except asyncio.CancelledError:
        print("Task was cancelled; check the code")
        info = traceback.format_exc()
        print(f"info ={info}")
    except Exception as e:
        print(f"Unknown error occurred: {e}")
        info = traceback.format_exc()
        print(f"info ={info}")

    generations.append([Generation(text=text)])
    token_total += len(text)
    llm_output['token_total'] = token_total
    result = LLMResult(generations=generations, llm_output=llm_output)
    return result

This ensures that text is always initialized, and if an exception occurs, text will still have a value (an empty string in this case), preventing the UnboundLocalError [1].

To continue talking to Dosu, mention @dosu.

minglong-huang commented 1 week ago

@dosu my code:

import typing as t
import asyncio
from typing import List
from datasets import load_dataset, load_from_disk
from ragas.metrics import faithfulness, context_recall, context_precision
from ragas.metrics import AnswerRelevancy
from ragas import evaluate
from ragas.llms import BaseRagasLLM
from langchain.schema import LLMResult
from langchain.schema import Generation
from langchain.callbacks.base import Callbacks
from langchain.schema.embeddings import Embeddings
from transformers import AutoModel, AutoTokenizer
from ragas.llms.prompt import PromptValue
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from FlagEmbedding import FlagModel
from FlagEmbedding import BGEM3FlagModel
from ragas.metrics import answer_relevancy
from langchain_core.language_models import BaseLanguageModel
from langchain_core.embeddings import Embeddings
from ragas.llms import BaseRagasLLM, LangchainLLMWrapper
from ragas.embeddings import BaseRagasEmbeddings
import asyncio
import traceback
from datasets import Dataset
from ragas.embeddings import LangchainEmbeddingsWrapper
import torch
from ragas.run_config import RunConfig, add_async_retry, add_retry
from abc import ABC

class MyLLM(BaseRagasLLM):

    def __init__(self,llm_path):
        self.tokenizer = AutoTokenizer.from_pretrained(llm_path, trust_remote_code=True)
        self.base_llm = AutoModel.from_pretrained(llm_path, trust_remote_code=True , torch_dtype=torch.bfloat16,low_cpu_mem_usage=True)
        self.base_llm = self.base_llm
        self.base_llm = self.base_llm.to('cuda').eval()

    @property
    def llm(self):
        return self.base_llm

    def get_llm_result(self, prompt):
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()

        text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        generations.append([Generation(text=text)])
        token_total += len(text)
        llm_output['token_total'] = token_total
        return LLMResult(generations=generations, llm_output=llm_output)

    def generate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ):
        print(f'runing generate_text function...')
        result = self.get_llm_result(prompt)
        return result

    async def agenerate_text(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: float = 1e-8,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = [],
    ) -> LLMResult:
        generations = []
        llm_output = {}
        token_total = 0
        content = prompt.to_string()
        text = None  # initialize the text variable
        print(f'running async def agenerate_text')
        # text, history = self.base_llm.chat(self.tokenizer, content, history=[])
        # print(f'*'*15)
        # print(("Generated text: %s", text))
        try:
            text, history = await asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,
                                                                       content, [])

            # text, history = await asyncio.wait_for(
            #     asyncio.get_event_loop().run_in_executor(None, self.base_llm.chat, self.tokenizer,content,[]),
            #     timeout=42  # e.g., set the timeout to 60 seconds
            # )

        except asyncio.TimeoutError:
            print("Operation timed out; check the code or increase the timeout")
        except asyncio.CancelledError:
            print("Task was cancelled; check the code")
            info = traceback.format_exc()
            print(f"info ={info}")
        except Exception as e:
            print(f"Unknown error occurred: {e}")

        if text is not None:  # make sure text has been assigned
            generations.append([Generation(text=text)])
            token_total += len(text)
            llm_output['token_total'] = token_total
            result = LLMResult(generations=generations, llm_output=llm_output)
            return result
        else:
            raise ValueError("Failed to generate text")

    async def generate(
            self,
            prompt: PromptValue,
            n: int = 1,
            temperature: t.Optional[float] = None,
            stop: t.Optional[t.List[str]] = None,
            callbacks: Callbacks = None,
            is_async: bool = True,
    ) -> LLMResult:
        if temperature is None:
            temperature = 1e-8
        if is_async:
            return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
        else:
            return self.generate_text(prompt, n, temperature, stop, callbacks)

# class MyEmbedding(Embeddings):
#
#     def __init__(self, path,max_length=8192, batch_size=256):
#         self.model = FlagModel(path, query_instruction_for_retrieval="为这个句子生成表示以用于检索相关文章:")
#         #self.model = BGEM3FlagModel(path, map_location='cuda')
#         self.max_length = max_length
#         self.batch_size = batch_size
#
#     def embed_documents(self, texts: List[str]) -> List[List[float]]:
#         return self.model.encode_corpus(texts, self.batch_size, self.max_length).tolist()
#
#     def embed_query(self, text: str) -> List[float]:
#         return self.model.encode_queries(text, self.batch_size, self.max_length).tolist()

class TEstEmbedding(Embeddings, ABC):
    run_config: RunConfig
    def __init__(self,model_path):
        self.embed_texts = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    def embed_text(self, text: str) -> List[float]:
        embs = self.embed_texts([text])
        return embs[0]

    def embed_texts(self, texts: List[str]) -> t.List[t.List[float]]:

        # loop = asyncio.get_event_loop()
        embed_documents_with_retry = add_retry(
            self.embed_documents, self.run_config
        )
        return embed_documents_with_retry(texts)

    async def aembed_text(self, text: str, is_async=True) -> List[float]:
        embs = await self.embed_texts([text], is_async=True)
        return embs[0]

    async def aembed_texts(
        self, texts: List[str], is_async: bool = True
    ) -> t.List[t.List[float]]:
        if is_async:
            aembed_documents_with_retry = add_async_retry(
                self.aembed_documents, self.run_config
            )
            return await aembed_documents_with_retry(texts)
        else:
            loop = asyncio.get_event_loop()
            embed_documents_with_retry = add_retry(
                self.embed_documents, self.run_config
            )
            return await loop.run_in_executor(None, embed_documents_with_retry, texts)

    def set_run_config(self, run_config: RunConfig):
        self.run_config = run_config

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.embed_texts.encode_corpus(texts, self.batch_size, self.max_length).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.embed_texts.encode_queries(text, self.batch_size, self.max_length).tolist()
# Data
data_path = "/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa"
amnesty_qa = load_dataset("/home/kelvin/nlp/graphrag/eval_dataset/amnesty_qa")

MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'
#MODEL_PATH = '/home/kelvin/nlp/model/LLM/Qwen/Qwen1.5-32B'
#MODEL_PATH = '/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat'

embed_model_path = '/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'
#embedding_model = MyEmbedding(embed_model_path)
embedding_model = TEstEmbedding(embed_model_path)
my_llm = MyLLM(MODEL_PATH)

ans_relevancy = AnswerRelevancy()

data_samples = {
    'question': ['When was the first super bowl?', 'Who won the most super bowls?'],
    'answer': ['The first superbowl was held on Jan 15, 1967', 'The most super bowls have been won by The New England Patriots'],
    'contexts' : [['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
    ['The Green Bay Packers...Green Bay, Wisconsin.','The Packers compete...Football Conference']],
    'ground_truth': ['The first superbowl was held on January 15, 1967', 'The New England Patriots have won the Super Bowl a record six times']
}

dataset = Dataset.from_dict(data_samples)
# amnesty_qa["eval"],
result = evaluate(
    dataset,
    metrics=[context_recall, context_precision, ans_relevancy, faithfulness],
    llm=my_llm,
    embeddings=embedding_model,
    is_async = True,
    raise_exceptions=True,
)

df = result.to_pandas()
print(df.head())
df.to_csv("result.csv", index=False)

# print(result)

error:

../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [125,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [126,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1231: indexSelectSmallIndex: block: [18,0,0], thread: [127,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
Evaluating:   0%|          | 0/8 [00:12<?, ?it/s]
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/threading.py", line 1009, in _bootstrap_inner
    self.run()
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 75, in run
    results = self.loop.run_until_complete(self._aresults())
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/base_events.py", line 641, in run_until_complete
    return future.result()
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 63, in _aresults
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 58, in _aresults
Unknown error occurred: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16BF, lda, b, CUDA_R_16BF, ldb, &fbeta, c, CUDA_R_16BF, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
Unknown error occurred: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/home/kelvin/.cache/huggingface/modules/transformers_modules/glm-4-9b-chat/modeling_chatglm.py", line 134, in apply_rotary_pos_emb
    xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
    rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
    x_out2 = torch.stack(
             ~~~~~~~~~~~ <--- HERE
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Unknown error occurred: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmEx( handle, opa, opb, m, n, k, &falpha, a, CUDA_R_16BF, lda, b, CUDA_R_16BF, ldb, &fbeta, c, CUDA_R_16BF, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`
Unknown error occurred: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Unknown error occurred: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Unknown error occurred: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Unknown error occurred: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Unknown error occurred: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
  File "/home/kelvin/.cache/huggingface/modules/transformers_modules/glm-4-9b-chat/modeling_chatglm.py", line 134, in apply_rotary_pos_emb
    xshaped = x.reshape(b, np, sq, rot_dim // 2, 2)
    rope_cache = rope_cache.view(-1, 1, sq, xshaped.size(3), 2)
    x_out2 = torch.stack(
             ~~~~~~~~~~~ <--- HERE
        [
            xshaped[..., 0] * rope_cache[..., 0] - xshaped[..., 1] * rope_cache[..., 1],
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

    r = await future
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py", line 575, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py", line 91, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 91, in ascore
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/base.py", line 87, in ascore
    score = await self._ascore(row=row, callbacks=group_cm, is_async=is_async)
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/metrics/_context_recall.py", line 113, in _ascore
    result = await self.llm.generate(
  File "/home/kelvin/nlp/graphrag/eval.py", line 126, in generate
    return await self.agenerate_text(prompt, n, temperature, stop, callbacks)
  File "/home/kelvin/nlp/graphrag/eval.py", line 111, in agenerate_text
    raise ValueError("Failed to generate text")
ValueError: Failed to generate text
Traceback (most recent call last):
  File "/home/kelvin/nlp/graphrag/eval.py", line 220, in <module>
    result = evaluate(
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 231, in evaluate
    raise e
  File "/home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/evaluation.py", line 213, in evaluate
    raise ExceptionInRunner()
ragas.exceptions.ExceptionInRunner: The runner thread which was running the jobs raised an exeception. Read the traceback above to debug it. You can also pass `raise_exception=False` incase you want to show only a warning message instead.
Task was destroyed but it is pending!
task: <Task pending name='context_recall-0' coro=<Executor.wrap_callable_with_index.<locals>.wrapped_callable_async() done, defined at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py:90> wait_for=<Future cancelled> cb=[as_completed.<locals>._on_completion() at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py:562]>
Task was destroyed but it is pending!
task: <Task pending name='faithfulness-1' coro=<Executor.wrap_callable_with_index.<locals>.wrapped_callable_async() done, defined at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/site-packages/ragas/executor.py:90> wait_for=<Future cancelled> cb=[as_completed.<locals>._on_completion() at /home/kelvin/anaconda3/envs/nlp/lib/python3.10/asyncio/tasks.py:562]>

Process finished with exit code 1
dosubot[bot] commented 1 week ago

To resolve the CUDA errors and the "ValueError: Failed to generate text" error in your agenerate_text method, ensure that the method is correctly implemented and that the necessary configurations and dependencies are properly set up. Here is the relevant part of the agenerate_text method from the LangchainLLMWrapper class:

async def agenerate_text(
    self,
    prompt: PromptValue,
    n: int = 1,
    temperature: t.Optional[float] = None,
    stop: t.Optional[t.List[str]] = None,
    callbacks: Callbacks = None,
) -> LLMResult:
    if temperature is None:
        temperature = self.get_temperature(n=n)

    if is_multiple_completion_supported(self.langchain_llm):
        return await self.langchain_llm.agenerate_prompt(
            prompts=[prompt],
            n=n,
            temperature=temperature,
            stop=stop,
            callbacks=callbacks,
        )
    else:
        result = await self.langchain_llm.agenerate_prompt(
            prompts=[prompt] * n,
            temperature=temperature,
            stop=stop,
            callbacks=callbacks,
        )
        # make LLMResult.generation appear as if it was n_completions
        # note that LLMResult.runs is still a list that represents each run
        generations = [[g[0] for g in result.generations]]
        result.generations = generations
        return result

Ensure that:

  1. The prompt parameter is correctly formatted and passed to the method.
  2. The langchain_llm object is properly initialized and supports the agenerate_prompt method (see the sketch after this list).
  3. The temperature, stop, and callbacks parameters are correctly handled.
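
For point 2, here is a minimal, hedged sketch of what a properly initialized langchain_llm (plus matching embeddings) could look like for the local models used above. Only LangchainLLMWrapper and LangchainEmbeddingsWrapper come from imports already present in the script; HuggingFacePipeline and HuggingFaceEmbeddings are illustrative assumptions, and whether glm-4-9b-chat loads cleanly through them is not verified here.

# Hedged sketch, not the author's code: wrap LangChain-compatible local models with the
# ragas wrappers instead of subclassing BaseRagasLLM by hand.
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

langchain_llm = HuggingFacePipeline.from_model_id(
    model_id='/home/kelvin/nlp/model/LLM/THUDM/glm-4-9b-chat',   # same path as MODEL_PATH
    task="text-generation",
    device=0,
    model_kwargs={"trust_remote_code": True},
    pipeline_kwargs={"max_new_tokens": 512},
)
langchain_emb = HuggingFaceEmbeddings(
    model_name='/home/kelvin/nlp/model/Embedding/BAAI/bge-m3'    # same path as embed_model_path
)

ragas_llm = LangchainLLMWrapper(langchain_llm)
ragas_embeddings = LangchainEmbeddingsWrapper(langchain_emb)
# then: evaluate(dataset, metrics=[...], llm=ragas_llm, embeddings=ragas_embeddings)

With the wrappers in place, ragas supplies generate_text/agenerate_text itself, so the async and retry plumbing in the custom MyLLM/TEstEmbedding classes would not need to be hand-written.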

Additionally, check the following:

From the SpecificQA class in the ragas_experimental module, ensure that the LLM is properly initialized before calling any generation methods. For example:

assert self.llm is not None, "LLM is not initialized"

This assertion is crucial to avoid errors related to uninitialized LLM objects. Also, ensure that the prompts used in the generate_questions and generate_question methods are correctly formatted and that the LLM can handle them without issues.

If the issue persists, you may need to provide more specific details about the CUDA errors and the context in which the "ValueError: Failed to generate text" error occurs [1][2].
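
Separately, the PyTorch output above already points at the debugging route: the indexSelectSmallIndex assertion typically indicates an out-of-range index (for example, a token id outside the embedding table), and because CUDA reports such failures asynchronously, re-running with CUDA_LAUNCH_BLOCKING=1 makes the traceback point at the call that actually failed. A minimal sketch, assuming it is placed at the very top of eval.py before anything touches CUDA:

# Debugging sketch only (it does not fix the assert): force synchronous CUDA launches so
# the device-side assert is reported at the triggering call. TORCH_USE_CUDA_DSA, also
# mentioned in the log, requires a PyTorch build compiled with that flag.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before torch initializes CUDA

# ... rest of the original eval.py (imports, MyLLM, TEstEmbedding, evaluate) unchanged ...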

To continue talking to Dosu, mention @dosu.