I tried to follow the instructions in the README.
I created an L4 GPU instance (based on the Ada Lovelace architecture, as you mention) via a Google Cloud Workbench notebook, with plenty of storage and RAM as well.
I installed the package with:
pip install --pre --extra-index-url https://pypi.nvidia.com optimum-nvidia
My Python version is 3.10.13.
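For context, here is a quick sanity check confirming the notebook actually sees the GPU (a minimal sketch using standard PyTorch calls; the expected output is my assumption for an L4 instance):

import torch

# Verify the CUDA device is visible before loading any model.
print(torch.cuda.is_available())      # expected: True
print(torch.cuda.get_device_name(0))  # expected: something like "NVIDIA L4"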
I used the imports mentioned in the README:
from optimum.nvidia import AutoModelForCausalLM
from transformers import AutoTokenizer
I downloaded a regular LLM that works fine without Optimum; I want to use the GPU for much faster inference, which is why I wanted to try Optimum.
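For reference, this is what I mean by "works without optimum": the plain transformers path runs fine on the same instance. A sketch of that baseline (torch_dtype=float16 and max_new_tokens=64 are illustrative choices, not my exact script):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline: load and run the same model with plain transformers on the GPU.
tok = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
baseline = AutoModelForCausalLM.from_pretrained(
    "RuterNorway/Llama-2-7b-chat-norwegian", torch_dtype=torch.float16
).to("cuda")
inputs = tok(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
out = baseline.generate(**inputs, max_new_tokens=64)
print(tok.batch_decode(out, skip_special_tokens=True)[0])

And this is the optimum-nvidia code that fails: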
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
    "RuterNorway/Llama-2-7b-chat-norwegian",
    use_fp8=True,
    max_prompt_length=1024,
    max_output_length=2048,  # Must be at least size of max_prompt_length + max_new_tokens
    max_batch_size=1,
)
model_inputs = tokenizer(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
generated_ids = model.generate(
    **model_inputs,
    top_k=40,
    top_p=0.7,
    repetition_penalty=10,
)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
After the model has been downloaded, I get this:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
Full error output:
0%| | 0/512 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[4], line 4
1 #os.environ['CUDA_VISIBLE_DEVICES'] ='0'
2 tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
----> 4 model = AutoModelForCausalLM.from_pretrained(
5 "RuterNorway/Llama-2-7b-chat-norwegian",
6 use_fp8=True,
7 max_prompt_length=1024,
8 max_output_length=2048, # Must be at least size of max_prompt_length + max_new_tokens
9 max_batch_size=1,
10 )
12 model_inputs = tokenizer(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
14 generated_ids = model.generate(
15 **model_inputs,
16 top_k=40,
17 top_p=0.7,
18 repetition_penalty=10,
19 )
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py:558, in ModelHubMixin.from_pretrained(cls, pretrained_model_name_or_path, force_download, resume_download, proxies, token, cache_dir, local_files_only, revision, **model_kwargs)
555 if cls._hub_mixin_inject_config and "config" not in model_kwargs:
556 model_kwargs["config"] = config
--> 558 instance = cls._from_pretrained(
559 model_id=str(model_id),
560 revision=revision,
561 cache_dir=cache_dir,
562 force_download=force_download,
563 proxies=proxies,
564 resume_download=resume_download,
565 local_files_only=local_files_only,
566 token=token,
567 **model_kwargs,
568 )
570 # Implicitly set the config as instance attribute if not already set by the class
571 # This way `config` will be available when calling `save_pretrained` or `push_to_hub`.
572 if config is not None and (getattr(instance, "_hub_mixin_config", None) in (None, {})):
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/models/auto.py:70, in AutoModelForCausalLM._from_pretrained(cls, model_id, revision, cache_dir, force_download, proxies, resume_download, local_files_only, token, config, **model_kwargs)
67 raise UnsupportedModelException(model_type)
69 model_clazz = _SUPPORTED_MODEL_CLASS[model_type]
---> 70 model = model_clazz.from_pretrained(
71 pretrained_model_name_or_path=model_id,
72 config=config,
73 revision=revision,
74 cache_dir=cache_dir,
75 force_download=force_download,
76 proxies=proxies,
77 resume_download=resume_download,
78 local_files_only=local_files_only,
79 token=token,
80 **model_kwargs,
81 )
83 return model
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py:558, in ModelHubMixin.from_pretrained(cls, pretrained_model_name_or_path, force_download, resume_download, proxies, token, cache_dir, local_files_only, revision, **model_kwargs)
555 if cls._hub_mixin_inject_config and "config" not in model_kwargs:
556 model_kwargs["config"] = config
--> 558 instance = cls._from_pretrained(
559 model_id=str(model_id),
560 revision=revision,
561 cache_dir=cache_dir,
562 force_download=force_download,
563 proxies=proxies,
564 resume_download=resume_download,
565 local_files_only=local_files_only,
566 token=token,
567 **model_kwargs,
568 )
570 # Implicitly set the config as instance attribute if not already set by the class
571 # This way `config` will be available when calling `save_pretrained` or `push_to_hub`.
572 if config is not None and (getattr(instance, "_hub_mixin_config", None) in (None, {})):
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py:427, in HuggingFaceHubModel._from_pretrained(cls, model_id, config, revision, cache_dir, force_download, proxies, resume_download, local_files_only, token, **model_kwargs)
410 LOGGER.debug(
411 f"Loading original transformers weights from the Hub ({model_id}@{revision})"
412 )
414 local_path = HuggingFaceHubModel.retrieve_snapshot_from_hub(
415 model_id,
416 revision,
(...)
423 prebuilt_engines_only=False,
424 )
426 checkpoint_folders, engines_folders, relative_paths_engines_folders = (
--> 427 cls.convert_and_build(local_path, config, **model_kwargs)
428 )
429 else:
430 LOGGER.info(f"Found pre-built engines at: {engines_folders}")
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py:302, in HuggingFaceHubModel.convert_and_build(cls, local_path, hf_model_config, engine_save_path, hf_model, config_class, **model_kwargs)
285 warn(
286 "Converting model to support float8 inference.\n"
287 f"Calibrating model with dataset='c4', split='train', samples={len(qconfig.calibration_dataset)}.\n"
(...)
291 "forwarding the configuration to .from_pretrained(..., quantization_config=qconfig)"
292 )
294 hf_quantizer = AmmoQuantizer(
295 quantization_config=qconfig,
296 artifact_path=checkpoint_folder,
(...)
299 export_tensorrt_llm_config=True,
300 )
--> 302 hf_quantizer.preprocess_model(hf_model, batch_size=1)
303 hf_quantizer.postprocess_model(hf_model)
305 else:
306 # Apply the conversion from Hugging Face weights to TRTLLM
File /opt/conda/lib/python3.10/site-packages/transformers/quantizers/base.py:166, in HfQuantizer.preprocess_model(self, model, **kwargs)
164 model.is_quantized = True
165 model.quantization_method = self.quantization_config.quant_method
--> 166 return self._process_model_before_weight_loading(model, **kwargs)
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/quantization/ammo/quantizer.py:125, in AmmoQuantizer._process_model_before_weight_loading(self, model, batch_size, **kwargs)
120 inputs = {
121 name: tensor.to("cuda:0") for name, tensor in sample.items()
122 }
123 model(**inputs)
--> 125 atq.quantize(model, config=qconfig.as_ammo_config(), forward_loop=_loop)
File /opt/conda/lib/python3.10/site-packages/ammo/torch/quantization/model_quant.py:112, in quantize(model, config, forward_loop)
38 """Quantize and calibrate the model in place.
39
40 This method performs in-place replacement of modules with their quantized counterparts and
(...)
109 Please subsample the dataset or reduce the number of batches if needed.
110 """
111 apply_mode(model, mode=[("quantize", config)])
--> 112 calibrate(model, config["algorithm"], forward_loop=forward_loop)
113 return model
File ammo/torch/quantization/model_calib.py:63, in ammo.torch.quantization.model_calib.calibrate()
File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ammo/torch/quantization/model_calib.py:73, in ammo.torch.quantization.model_calib.max_calibrate()
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/quantization/ammo/quantizer.py:123, in AmmoQuantizer._process_model_before_weight_loading.<locals>._loop()
119 for sample in tqdm(data):
120 inputs = {
121 name: tensor.to("cuda:0") for name, tensor in sample.items()
122 }
--> 123 model(**inputs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1176, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1173 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1175 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1176 outputs = self.model(
1177 input_ids=input_ids,
1178 attention_mask=attention_mask,
1179 position_ids=position_ids,
1180 past_key_values=past_key_values,
1181 inputs_embeds=inputs_embeds,
1182 use_cache=use_cache,
1183 output_attentions=output_attentions,
1184 output_hidden_states=output_hidden_states,
1185 return_dict=return_dict,
1186 cache_position=cache_position,
1187 )
1189 hidden_states = outputs[0]
1190 if self.config.pretraining_tp > 1:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:977, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
974 use_cache = False
976 if inputs_embeds is None:
--> 977 inputs_embeds = self.embed_tokens(input_ids)
979 past_seen_tokens = 0
980 if use_cache: # kept for BC (cache positions)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/sparse.py:163, in Embedding.forward(self, input)
162 def forward(self, input: Tensor) -> Tensor:
--> 163 return F.embedding(
164 input, self.weight, self.padding_idx, self.max_norm,
165 self.norm_type, self.scale_grad_by_freq, self.sparse)
File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:2237, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2231 # Note [embedding_renorm set_grad_enabled]
2232 # XXX: equivalent to
2233 # with torch.no_grad():
2234 # torch.embedding_renorm_
2235 # remove once script supports set_grad_enabled
2236 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2237 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
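Reading the trace, my (possibly wrong) interpretation: with use_fp8=True, the calibration loop in optimum/nvidia/quantization/ammo/quantizer.py moves every calibration batch to cuda:0, but the underlying transformers model apparently still lives on the CPU, so F.embedding fails the device check. A minimal sketch that reproduces the same error pattern with plain PyTorch (sizes are made up for illustration):

import torch
import torch.nn as nn

# Embedding weights on CPU, input ids on cuda:0 -- the same mismatch as in the trace.
embed = nn.Embedding(num_embeddings=32000, embedding_dim=64)   # stays on CPU
input_ids = torch.tensor([[1, 2, 3]]).to("cuda:0")             # moved to GPU
embed(input_ids)  # RuntimeError: Expected all tensors to be on the same device ...

Am I missing a step on my side, or does the calibration path need to move the model to the GPU?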