I tried to follow the instructions in the README.
I created an L4 GPU instance (based on the Ada Lovelace architecture, as you mention) via a Google Cloud Workbench notebook, with plenty of storage and RAM as well.
I installed the package with:
pip install --pre --extra-index-url https://pypi.nvidia.com optimum-nvidia
My Python version is 3.10.13.
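For context, here is a quick sanity check confirming the notebook actually sees the GPU (a minimal sketch using standard PyTorch calls; the expected output is my assumption for an L4 instance):

import torch

# Verify the CUDA device is visible before loading any model.
print(torch.cuda.is_available())      # expected: True
print(torch.cuda.get_device_name(0))  # expected: something like "NVIDIA L4"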
I used the imports mentioned in the README:
from optimum.nvidia import AutoModelForCausalLM
from transformers import AutoTokenizer
I downloaded a regular LLM that works fine without Optimum; I want to use the GPU for much faster inference, which is why I wanted to try Optimum.
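For reference, this is what I mean by "works without optimum": the plain transformers path runs fine on the same instance. A sketch of that baseline (torch_dtype=float16 and max_new_tokens=64 are illustrative choices, not my exact script):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline: load and run the same model with plain transformers on the GPU.
tok = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
baseline = AutoModelForCausalLM.from_pretrained(
    "RuterNorway/Llama-2-7b-chat-norwegian", torch_dtype=torch.float16
).to("cuda")
inputs = tok(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
out = baseline.generate(**inputs, max_new_tokens=64)
print(tok.batch_decode(out, skip_special_tokens=True)[0])

And this is the optimum-nvidia code that fails: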
tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
    "RuterNorway/Llama-2-7b-chat-norwegian",
    use_fp8=True,
    max_prompt_length=1024,
    max_output_length=2048,  # Must be at least size of max_prompt_length + max_new_tokens
    max_batch_size=1,
)
model_inputs = tokenizer(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
generated_ids = model.generate(
    **model_inputs,
    top_k=40,
    top_p=0.7,
    repetition_penalty=10,
)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
After the model has been downloaded, I get this:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
Full error output:
0%| | 0/512 [00:00<?, ?it/s]
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[4], line 4
1 #os.environ['CUDA_VISIBLE_DEVICES'] ='0'
2 tokenizer = AutoTokenizer.from_pretrained("RuterNorway/Llama-2-7b-chat-norwegian", padding_side="left")
----> 4 model = AutoModelForCausalLM.from_pretrained(
5 "RuterNorway/Llama-2-7b-chat-norwegian",
6 use_fp8=True,
7 max_prompt_length=1024,
8 max_output_length=2048, # Must be at least size of max_prompt_length + max_new_tokens
9 max_batch_size=1,
10 )
12 model_inputs = tokenizer(["Hva henter Norges hovedstad?"], return_tensors="pt").to("cuda")
14 generated_ids = model.generate(
15 **model_inputs,
16 top_k=40,
17 top_p=0.7,
18 repetition_penalty=10,
19 )
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py:558, in ModelHubMixin.from_pretrained(cls, pretrained_model_name_or_path, force_download, resume_download, proxies, token, cache_dir, local_files_only, revision, **model_kwargs)
555 if cls._hub_mixin_inject_config and "config" not in model_kwargs:
556 model_kwargs["config"] = config
--> 558 instance = cls._from_pretrained(
559 model_id=str(model_id),
560 revision=revision,
561 cache_dir=cache_dir,
562 force_download=force_download,
563 proxies=proxies,
564 resume_download=resume_download,
565 local_files_only=local_files_only,
566 token=token,
567 **model_kwargs,
568 )
570 # Implicitly set the config as instance attribute if not already set by the class
571 # This way `config` will be available when calling `save_pretrained` or `push_to_hub`.
572 if config is not None and (getattr(instance, "_hub_mixin_config", None) in (None, {})):
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/models/auto.py:70, in AutoModelForCausalLM._from_pretrained(cls, model_id, revision, cache_dir, force_download, proxies, resume_download, local_files_only, token, config, **model_kwargs)
67 raise UnsupportedModelException(model_type)
69 model_clazz = _SUPPORTED_MODEL_CLASS[model_type]
---> 70 model = model_clazz.from_pretrained(
71 pretrained_model_name_or_path=model_id,
72 config=config,
73 revision=revision,
74 cache_dir=cache_dir,
75 force_download=force_download,
76 proxies=proxies,
77 resume_download=resume_download,
78 local_files_only=local_files_only,
79 token=token,
80 **model_kwargs,
81 )
83 return model
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:114, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
111 if check_use_auth_token:
112 kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
--> 114 return fn(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/huggingface_hub/hub_mixin.py:558, in ModelHubMixin.from_pretrained(cls, pretrained_model_name_or_path, force_download, resume_download, proxies, token, cache_dir, local_files_only, revision, **model_kwargs)
555 if cls._hub_mixin_inject_config and "config" not in model_kwargs:
556 model_kwargs["config"] = config
--> 558 instance = cls._from_pretrained(
559 model_id=str(model_id),
560 revision=revision,
561 cache_dir=cache_dir,
562 force_download=force_download,
563 proxies=proxies,
564 resume_download=resume_download,
565 local_files_only=local_files_only,
566 token=token,
567 **model_kwargs,
568 )
570 # Implicitly set the config as instance attribute if not already set by the class
571 # This way `config` will be available when calling `save_pretrained` or `push_to_hub`.
572 if config is not None and (getattr(instance, "_hub_mixin_config", None) in (None, {})):
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py:427, in HuggingFaceHubModel._from_pretrained(cls, model_id, config, revision, cache_dir, force_download, proxies, resume_download, local_files_only, token, **model_kwargs)
410 LOGGER.debug(
411 f"Loading original transformers weights from the Hub ({model_id}@{revision})"
412 )
414 local_path = HuggingFaceHubModel.retrieve_snapshot_from_hub(
415 model_id,
416 revision,
(...)
423 prebuilt_engines_only=False,
424 )
426 checkpoint_folders, engines_folders, relative_paths_engines_folders = (
--> 427 cls.convert_and_build(local_path, config, **model_kwargs)
428 )
429 else:
430 LOGGER.info(f"Found pre-built engines at: {engines_folders}")
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/hub.py:302, in HuggingFaceHubModel.convert_and_build(cls, local_path, hf_model_config, engine_save_path, hf_model, config_class, **model_kwargs)
285 warn(
286 "Converting model to support float8 inference.\n"
287 f"Calibrating model with dataset='c4', split='train', samples={len(qconfig.calibration_dataset)}.\n"
(...)
291 "forwarding the configuration to .from_pretrained(..., quantization_config=qconfig)"
292 )
294 hf_quantizer = AmmoQuantizer(
295 quantization_config=qconfig,
296 artifact_path=checkpoint_folder,
(...)
299 export_tensorrt_llm_config=True,
300 )
--> 302 hf_quantizer.preprocess_model(hf_model, batch_size=1)
303 hf_quantizer.postprocess_model(hf_model)
305 else:
306 # Apply the conversion from Hugging Face weights to TRTLLM
File /opt/conda/lib/python3.10/site-packages/transformers/quantizers/base.py:166, in HfQuantizer.preprocess_model(self, model, **kwargs)
164 model.is_quantized = True
165 model.quantization_method = self.quantization_config.quant_method
--> 166 return self._process_model_before_weight_loading(model, **kwargs)
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/quantization/ammo/quantizer.py:125, in AmmoQuantizer._process_model_before_weight_loading(self, model, batch_size, **kwargs)
120 inputs = {
121 name: tensor.to("cuda:0") for name, tensor in sample.items()
122 }
123 model(**inputs)
--> 125 atq.quantize(model, config=qconfig.as_ammo_config(), forward_loop=_loop)
File /opt/conda/lib/python3.10/site-packages/ammo/torch/quantization/model_quant.py:112, in quantize(model, config, forward_loop)
38 """Quantize and calibrate the model in place.
39
40 This method performs in-place replacement of modules with their quantized counterparts and
(...)
109 Please subsample the dataset or reduce the number of batches if needed.
110 """
111 apply_mode(model, mode=[("quantize", config)])
--> 112 calibrate(model, config["algorithm"], forward_loop=forward_loop)
113 return model
File ammo/torch/quantization/model_calib.py:63, in ammo.torch.quantization.model_calib.calibrate()
File /opt/conda/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ammo/torch/quantization/model_calib.py:73, in ammo.torch.quantization.model_calib.max_calibrate()
File /opt/conda/lib/python3.10/site-packages/optimum/nvidia/quantization/ammo/quantizer.py:123, in AmmoQuantizer._process_model_before_weight_loading.<locals>._loop()
119 for sample in tqdm(data):
120 inputs = {
121 name: tensor.to("cuda:0") for name, tensor in sample.items()
122 }
--> 123 model(**inputs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1176, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1173 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1175 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1176 outputs = self.model(
1177 input_ids=input_ids,
1178 attention_mask=attention_mask,
1179 position_ids=position_ids,
1180 past_key_values=past_key_values,
1181 inputs_embeds=inputs_embeds,
1182 use_cache=use_cache,
1183 output_attentions=output_attentions,
1184 output_hidden_states=output_hidden_states,
1185 return_dict=return_dict,
1186 cache_position=cache_position,
1187 )
1189 hidden_states = outputs[0]
1190 if self.config.pretraining_tp > 1:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:977, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
974 use_cache = False
976 if inputs_embeds is None:
--> 977 inputs_embeds = self.embed_tokens(input_ids)
979 past_seen_tokens = 0
980 if use_cache: # kept for BC (cache positions)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1511, in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1520, in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/sparse.py:163, in Embedding.forward(self, input)
162 def forward(self, input: Tensor) -> Tensor:
--> 163 return F.embedding(
164 input, self.weight, self.padding_idx, self.max_norm,
165 self.norm_type, self.scale_grad_by_freq, self.sparse)
File /opt/conda/lib/python3.10/site-packages/torch/nn/functional.py:2237, in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2231 # Note [embedding_renorm set_grad_enabled]
2232 # XXX: equivalent to
2233 # with torch.no_grad():
2234 # torch.embedding_renorm_
2235 # remove once script supports set_grad_enabled
2236 _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2237 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)
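Reading the trace, my (possibly wrong) interpretation: with use_fp8=True, the calibration loop in optimum/nvidia/quantization/ammo/quantizer.py moves every calibration batch to cuda:0, but the underlying transformers model apparently still lives on the CPU, so F.embedding fails the device check. A minimal sketch that reproduces the same error pattern with plain PyTorch (sizes are made up for illustration):

import torch
import torch.nn as nn

# Embedding weights on CPU, input ids on cuda:0 -- the same mismatch as in the trace.
embed = nn.Embedding(num_embeddings=32000, embedding_dim=64)   # stays on CPU
input_ids = torch.tensor([[1, 2, 3]]).to("cuda:0")             # moved to GPU
embed(input_ids)  # RuntimeError: Expected all tensors to be on the same device ...

Am I missing a step on my side, or does the calibration path need to move the model to the GPU?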