liltom-eth / llama2-webui

Run any Llama 2 locally with gradio UI on GPU or CPU from anywhere (Linux/Windows/Mac). Use `llama2-wrapper` as your local llama2 backend for Generative Agents/Apps.
MIT License

cannot run Llama-2-70b-hf #53

Closed · takitsuba closed this issue 1 year ago

takitsuba commented 1 year ago

I cannot run Llama-2-70b-hf with the transformers backend. I tried to use multiple GPUs, but generation fails with the errors below. If anyone knows how to solve this problem, please let me know.

sample code

import os
from llama2_wrapper import LLAMA2_WRAPPER, get_prompt

# Make only four GPUs visible for the 70B model.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

llama2_wrapper = LLAMA2_WRAPPER(
    model_path="/home/takizawa/model/Llama-2-70b-hf",
    backend_type="transformers",
    load_in_8bit=False,
)

prompt = get_prompt("Hi do you know Pytorch?")
print(llama2_wrapper(prompt))
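
For reference, below is a minimal transformers-only sketch of what I assume the wrapper does internally; the device_map="auto" sharding and float16 dtype are my guesses, not necessarily the wrapper's actual settings. It may help to check whether plain transformers/accelerate hits the same error on this machine.

import os

# Assumption: same GPU selection as above; set before CUDA is initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "/home/takizawa/model/Llama-2-70b-hf"  # same local checkpoint as above

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # assumption: fp16 weights sharded over the 4 visible GPUs
    device_map="auto",          # let accelerate place layers on the visible GPUs
)

inputs = tokenizer("Hi do you know Pytorch?", return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)  # small cap, just for the repro
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))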

Error messages

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [0,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [1,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [2,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.

( ... )

../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [95,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[5], line 4
      1 from llama2_wrapper import get_prompt
      3 prompt = get_prompt("Hi do you know Pytorch?")
----> 4 print(llama2_wrapper(prompt))

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/llama2_wrapper/model.py:363, in LLAMA2_WRAPPER.__call__(self, prompt, stream, max_new_tokens, temperature, top_p, top_k, repetition_penalty, **kwargs)
    361     return streamer
    362 else:
--> 363     output_ids = self.model.generate(
    364         **generate_kwargs,
    365     )
    366     output = self.tokenizer.decode(output_ids[0])
    367     return output.split("[/INST]")[1].split("</s>")[0]

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    112 @functools.wraps(func)
    113 def decorate_context(*args, **kwargs):
    114     with ctx_factory():
--> 115         return func(*args, **kwargs)

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:1538, in GenerationMixin.generate(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, **kwargs)
   1532         raise ValueError(
   1533             "num_return_sequences has to be 1 when doing greedy search, "
   1534             f"but is {generation_config.num_return_sequences}."
   1535         )
   1537     # 11. run greedy search
-> 1538     return self.greedy_search(
   1539         input_ids,
   1540         logits_processor=logits_processor,
   1541         stopping_criteria=stopping_criteria,
   1542         pad_token_id=generation_config.pad_token_id,
   1543         eos_token_id=generation_config.eos_token_id,
   1544         output_scores=generation_config.output_scores,
   1545         return_dict_in_generate=generation_config.return_dict_in_generate,
   1546         synced_gpus=synced_gpus,
   1547         streamer=streamer,
   1548         **model_kwargs,
   1549     )
   1551 elif is_contrastive_search_gen_mode:
   1552     if generation_config.num_return_sequences > 1:

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/generation/utils.py:2362, in GenerationMixin.greedy_search(self, input_ids, logits_processor, stopping_criteria, max_length, pad_token_id, eos_token_id, output_attentions, output_hidden_states, output_scores, return_dict_in_generate, synced_gpus, streamer, **model_kwargs)
   2359 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
   2361 # forward pass to get next token
-> 2362 outputs = self(
   2363     **model_inputs,
   2364     return_dict=True,
   2365     output_attentions=output_attentions,
   2366     output_hidden_states=output_hidden_states,
   2367 )
   2369 if synced_gpus and this_peer_finished:
   2370     continue  # don't waste resources running the code we don't need

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163         output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:806, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)
    803 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    805 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
--> 806 outputs = self.model(
    807     input_ids=input_ids,
    808     attention_mask=attention_mask,
    809     position_ids=position_ids,
    810     past_key_values=past_key_values,
    811     inputs_embeds=inputs_embeds,
    812     use_cache=use_cache,
    813     output_attentions=output_attentions,
    814     output_hidden_states=output_hidden_states,
    815     return_dict=return_dict,
    816 )
    818 hidden_states = outputs[0]
    819 if self.pretraining_tp > 1:

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:693, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)
    685     layer_outputs = torch.utils.checkpoint.checkpoint(
    686         create_custom_forward(decoder_layer),
    687         hidden_states,
   (...)
    690         None,
    691     )
    692 else:
--> 693     layer_outputs = decoder_layer(
    694         hidden_states,
    695         attention_mask=attention_mask,
    696         position_ids=position_ids,
    697         past_key_value=past_key_value,
    698         output_attentions=output_attentions,
    699         use_cache=use_cache,
    700     )
    702 hidden_states = layer_outputs[0]
    704 if use_cache:

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163         output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:408, in LlamaDecoderLayer.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
    405 hidden_states = self.input_layernorm(hidden_states)
    407 # Self Attention
--> 408 hidden_states, self_attn_weights, present_key_value = self.self_attn(
    409     hidden_states=hidden_states,
    410     attention_mask=attention_mask,
    411     position_ids=position_ids,
    412     past_key_value=past_key_value,
    413     output_attentions=output_attentions,
    414     use_cache=use_cache,
    415 )
    416 hidden_states = residual + hidden_states
    418 # Fully Connected

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/accelerate/hooks.py:165, in add_hook_to_module.<locals>.new_forward(*args, **kwargs)
    163         output = old_forward(*args, **kwargs)
    164 else:
--> 165     output = old_forward(*args, **kwargs)
    166 return module._hf_hook.post_forward(module, output)

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py:330, in LlamaAttention.forward(self, hidden_states, attention_mask, position_ids, past_key_value, output_attentions, use_cache)
    327 key_states = repeat_kv(key_states, self.num_key_value_groups)
    328 value_states = repeat_kv(value_states, self.num_key_value_groups)
--> 330 attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
    332 if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
    333     raise ValueError(
    334         f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
    335         f" {attn_weights.size()}"
    336     )

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasGemmStridedBatchedExFix( handle, opa, opb, m, n, k, (void*)(&falpha), a, CUDA_R_16F, lda, stridea, b, CUDA_R_16F, ldb, strideb, (void*)(&fbeta), c, CUDA_R_16F, ldc, stridec, num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`

version

Python 3.11.3

llama2-wrapper==0.1.8
torch==2.0.1
transformers==4.31.0

NVIDIA-SMI 465.19.01
Driver Version: 465.19.01
CUDA Version: 11.3 

notes

This huggingface/transformers issue may be related to this issue.
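
A general first debugging step (my guess, not something I have confirmed helps here) is to rerun with CUDA_LAUNCH_BLOCKING=1: the device-side index asserts above are raised asynchronously, so the later cuBLAS failure may only be a downstream symptom, and synchronous launches should report the error at the call that actually triggers it.

import os

# Assumption: this must be set before any CUDA work in the process (e.g. at the very
# top of the script) so kernel launches run synchronously and the index-out-of-bounds
# assert is attributed to the call that caused it.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"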

takitsuba commented 1 year ago

Sorry, it seems that the slow communication speed between GPUs (without NVLink) was the cause of this issue. I will try again after improving the speed. I apologize for the inconvenience.

liltom-eth commented 1 year ago

@takitsuba I made some updates in llama2-wrapper==0.1.9:

File ~/Projects/test_llama2wrapper/.venv/lib/python3.11/site-packages/llama2_wrapper/model.py:363, in LLAMA2_WRAPPER.__call__(self, prompt, stream, max_new_tokens, temperature, top_p, top_k, repetition_penalty, **kwargs)
    361     return streamer
    362 else:
    363     output_ids = self.model.generate(
    364         **generate_kwargs,
    365     )
    366     output = self.tokenizer.decode(output_ids[0])
--> 367     return output.split("[/INST]")[1].split("</s>")[0]

Now it won't split the output text; instead it uses output = self.tokenizer.decode(output_ids[0][prompt_tokens_len:], skip_special_tokens=True) to skip the input prompt in the generation. It should be less flaky.
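
A rough sketch of the difference (prompt_tokens_len follows the naming above; this is illustrative, not the exact wrapper code):

def decode_generation(tokenizer, input_ids, output_ids):
    """Decode only the newly generated tokens, skipping the echoed prompt.

    The previous behaviour decoded the full sequence and then did
    .split("[/INST]")[1].split("</s>")[0], which fails whenever the
    "[/INST]" marker is absent (e.g. with base, non-chat checkpoints).
    """
    prompt_tokens_len = input_ids.shape[-1]
    return tokenizer.decode(
        output_ids[0][prompt_tokens_len:], skip_special_tokens=True
    )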

The multi-GPU issue is still hard to investigate. Have you tried Hugging Face Text Generation Inference on this powerful device?

takitsuba commented 1 year ago

Thank you for your reply! I will try 0.1.9.

I also tried Hugging Face text generation and it failed. After creating this issue, I have come to think that the fundamental cause is the slow communication speed between the GPUs 🙇 Below is the result of p2pBandwidthLatencyTest (GPU pairs 0-1, 2-3, 4-5, and 6-7 are connected with NVLink).

P2P=Enabled Latency (P2P Writes) Matrix (us)
   GPU        0        1        2        3        4        5        6        7
     0     2.51     2.47 49204.84 49204.84 49204.53 49204.35 49204.41 49203.94
     1     2.54     2.68 49204.94 49204.97 49204.91 49204.94 49204.95 49204.91
     2 49204.79 49204.79     2.34     2.45 49204.80 49204.79 49204.79 49204.80
     3 49204.95 49204.93     2.56     2.45 49204.96 49204.99 49204.96 49204.91
     4 49204.80 49204.90 49204.85 49204.82     2.42     2.42 49204.84 49204.80
     5 49204.77 49204.73 49204.70 49204.68     2.49     2.28 49204.75 49204.74
     6 49204.93 49204.83 49204.91 49204.89 49204.87 49204.88     2.27     2.41
     7 49204.91 49204.85 49204.85 49204.91 49204.84 49204.84     2.51     2.26
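
For completeness, a small sketch of how one could also check peer-to-peer accessibility from PyTorch on the same machine (torch.cuda.can_device_access_peer only reports whether P2P is possible at all, not how fast it is, so it complements rather than replaces p2pBandwidthLatencyTest):

import torch

# Print which visible GPUs can directly access each other's memory over P2P.
# 1 means device i can access device j; the diagonal is trivially 1.
n = torch.cuda.device_count()
for i in range(n):
    row = [1 if i == j else int(torch.cuda.can_device_access_peer(i, j)) for j in range(n)]
    print(f"GPU {i}: {row}")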

I think my problem is not due to llama2-wrapper, so I would like to close this issue.