I have a fine-tuned Llama-2 13B chat model saved locally on my system (Linux OS).

When I run the following code:

from model.ea_model import EaModel from pathlib import Path import torch from fastchat.model import get_conversation_template import os device = "cuda:0" base_model_path="llama" EAGLE_model_path="yuhuili/EAGLE-llama2-chat-13B" model = EaModel.from_pretrained( base_model_path=base_model_path, ea_model_path=EAGLE_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True,

map_location='cuda:0'

device_map=device

) model.eval()

ues_llama_2_chat=1 use_vicuna=0 your_message="Hello"

if ues_llama_2_chat: conv = get_conversation_template("llama-2-chat")
sys_p = "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information." conv.system_message = sys_p conv.append_message(conv.roles[0], your_message) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt() + " "

if use_vicuna: conv = get_conversation_template("vicuna") conv.append_message(conv.roles[0], your_message) conv.append_message(conv.roles[1], None) prompt = conv.get_prompt()

input_ids=model.tokenizer([prompt]).input_ids input_ids = torch.as_tensor(input_ids).to(device) output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512) output=model.tokenizer.decode(output_ids[0]) print(output)

I get the following error:

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [100,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [101,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [102,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [103,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [104,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [105,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [106,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [107,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [108,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [109,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [110,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [111,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [112,0,0] Assertion srcIndex < srcSelectDimSize failed. 
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [113,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [114,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [115,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [116,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [117,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [118,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [119,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [120,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [121,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [122,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [123,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [124,0,0] Assertion srcIndex < srcSelectDimSize failed. ../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [125,0,0] Assertion srcIndex < srcSelectDimSize failed. 
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [126,0,0] Assertion srcIndex < srcSelectDimSize failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [267,0,0], thread: [127,0,0] Assertion srcIndex < srcSelectDimSize failed.
Traceback (most recent call last):
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/sample_code.py", line 45, in <module>
    output_ids=model.eagenerate(input_ids,temperature=0.5,max_new_tokens=512)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 218, in eagenerate
    tree_logits, logits, hidden_state, sample_token = initialize_tree(
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/utils.py", line 164, in initialize_tree
    tree_logits, outputs, logits,hidden_state,sample_token = model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 159, in forward
    ea_logits = self.ea_layer.topK_genrate(hidden_states, input_ids, self.base_model.lm_head, logits_processor)
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/cnets.py", line 867, in topK_genrate
    topk_index,topk_prob=self.sample(last_headout,logits_processor,k=top_k,)
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/cnets.py", line 805, in sample
    sampled_indices = torch.multinomial(probabilities, k, replacement=False)
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

When I run the above code after setting os.environ["CUDA_VISIBLE_DEVICES"] = "1", I get the following runtime error:

Traceback (most recent call last):
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/sample_code.py", line 12, in <module>
    model = EaModel.from_pretrained(
  File "/home/azureuser/tensorrtllm_backend/llama-experiments/EAGLE/model/ea_model.py", line 118, in from_pretrained
    ea_layer_state_dict = torch.load(load_model_path,
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 809, in load
    return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1172, in _load
    result = unpickler.load()
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1142, in persistent_load
    typed_storage = load_tensor(dtype, nbytes, key, _maybe_decode_ascii(location))
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1116, in load_tensor
    wrap_storage=restore_location(storage, location),
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 1086, in restore_location
    return default_restore_location(storage, str(map_location))
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 217, in default_restore_location
    result = fn(storage, location)
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 182, in _cuda_deserialize
    device = validate_cuda_device(location)
  File "/usr/local/lib/python3.10/dist-packages/torch/serialization.py", line 173, in validate_cuda_device
    raise RuntimeError('Attempting to deserialize object on CUDA device '
RuntimeError: Attempting to deserialize object on CUDA device 0 but torch.cuda.device_count() is 0. Please use torch.load with map_location to map your storages to an existing device.

SafeAILab / EAGLE

Weird Runtime Error during Inference #19

map_location='cuda:0'