Closed nss-programmer closed 8 months ago
Attempting to deserialize object on CUDA device 0 but torch.cuda.device_count() is 0.
It seems that your GPU is unavailable
When I load the fine-tuned model from my local disk, I get the error above; when I load a pre-trained model from Hugging Face, I do not face any issue.
I have a fine-tuned Llama-2 13B chat model saved locally on my system (Linux).
When I run the following code:
from model.ea_model import EaModel from pathlib import Path import torch from fastchat.model import get_conversation_template import os device = "cuda:0" base_model_path="llama" EAGLE_model_path="yuhuili/EAGLE-llama2-chat-13B" model = EaModel.from_pretrained( base_model_path=base_model_path, ea_model_path=EAGLE_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True,
map_location='cuda:0'
) model.eval()
ues_llama_2_chat=1 use_vicuna=0 your_message="Hello"
if ues_llama_2_chat: conv = get_conversation_template("llama-2-chat")
sys_p = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." conv.system_message = sys_p conv.append_message(conv.roles[0], your_message) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() + " "
if use_vicuna: conv = get_conversation_template("vicuna") conv.append_message(conv.roles[0], your_message) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt()
input_ids=model.tokenizer([prompt]).input_ids input_ids = torch.as_tensor(input_ids).to(device) output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512) output=model.tokenizer.decode(output_ids[0]) print(output)
I get the following error:
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [100,0,0] Assertion
output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 218, in eagenerate
tree_logits, logits, hidden_state, sample_token = initialize_tree(
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/utils.py", line 164, in initialize_tree
tree_logits, outputs, logits,hidden_state,sample_token = model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 159, in forward
ea_logits = self.ea_layer.topK_genrate(hidden_states, input_ids, self.base_model.lm_head, logits_processor)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/cnets.py", line 867, in topK_genrate
topk_index,topk_prob=self.sample(last_headout,logits_processor,k=top_k,)
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/cnets.py", line 805, in sample
sampled_indices = torch.multinomial(probabilities, k, replacement=False)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with
srcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [101,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [102,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [103,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [104,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [105,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [106,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [107,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [108,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [109,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [110,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [111,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [112,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [113,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [114,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [115,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [116,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [117,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [118,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [119,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [120,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [121,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [122,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [123,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [124,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [125,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [126,0,0] AssertionsrcIndex < srcSelectDimSize
failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [127,0,0] AssertionsrcIndex < srcSelectDimSize
failed. Traceback (most recent call last): File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/sample_code.py", line 45, inTORCH_USE_CUDA_DSA
to enable device-side assertions.
When I run the above code after setting os.environ["CUDA_VISIBLE_DEVICES"] = "1", I get the following runtime error:
Traceback (most recent call last): File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/sample_code.py", line 12, in
model = EaModel.from_pretrained(
File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 118, in from_pretrained
ea_layer_state_dict = torch.load(load_model_path,
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 809, in load
return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1172, in _load
result = unpickler.load()
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1142, in persistent_load
typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1116, in load_tensor
wrap_storage=restore_location(storage, location),
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1086, in restore_location
return default_restore_location(storage, str(map_location))
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 217, in default_restore_location
result = fn(storage, location)
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 182, in _cuda_deserialize
device = validate_cuda_device(location)
File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 173, in validate_cuda_device
raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 0 but torch.cuda.device_count() is 0. Please use torch.load with map_location to map your storages to an existing device.