What is the best practice for quantizing a typical 16-bit 70B model on a single A100 VM?
What code do you use when you have a 70B model, e.g. an Unsloth fine-tuned model with the adapter merged?
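For reference, the code I'm running is essentially the standard AutoAWQ quantization example, reconstructed here from the traceback below; model_path and quant_path are placeholders:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "path/to/merged-70b-model"  # placeholder: the merged Unsloth fine-tune
quant_path = "path/to/merged-70b-awq"    # placeholder: output directory

# Standard AutoAWQ 4-bit config
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```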
With the default code I get:
Repo card metadata block was not found. Setting CardData to empty.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
{
"name": "RuntimeError",
"message": "Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 26
23 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
25 # Quantize
---> 26 model.quantize(tokenizer, quant_config=quant_config)
28 # Save quantized model
29 model.save_quantized(quant_path)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/models/base.py:211, in BaseAWQForCausalLM.quantize(self, tokenizer, quant_config, calib_data, split, text_column, duo_scaling, export_compatible, apply_clip, n_parallel_calib_samples, max_calib_samples, max_calib_seq_len, max_chunk_memory)
208 if hasattr(self, \"modules_to_not_convert\"):
209 self.quant_config.modules_to_not_convert = self.modules_to_not_convert
--> 211 self.quantizer = AwqQuantizer(
212 self,
213 self.model,
214 tokenizer,
215 self.quant_config.w_bit,
216 self.quant_config.q_group_size,
217 self.quant_config.zero_point,
218 self.quant_config.version,
219 calib_data,
220 split,
221 text_column,
222 duo_scaling,
223 modules_to_not_convert=self.quant_config.modules_to_not_convert,
224 export_compatible=export_compatible,
225 apply_clip=apply_clip,
226 n_parallel_calib_samples=n_parallel_calib_samples,
227 max_calib_samples=max_calib_samples,
228 max_calib_seq_len=max_calib_seq_len,
229 max_chunk_memory=max_chunk_memory,
230 )
231 self.quantizer.quantize()
233 self.is_quantized = True
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/quantize/quantizer.py:69, in AwqQuantizer.__init__(self, awq_model, model, tokenizer, w_bit, group_size, zero_point, version, calib_data, split, text_column, duo_scaling, modules_to_not_convert, export_compatible, apply_clip, n_parallel_calib_samples, max_calib_samples, max_calib_seq_len, max_chunk_memory)
65 self.max_chunk_memory = max_chunk_memory
66 self.modules_to_not_convert = (
67 modules_to_not_convert if modules_to_not_convert is not None else []
68 )
---> 69 self.modules, self.module_kwargs, self.inps = self.init_quant(
70 n_samples=self.max_calib_samples, max_seq_len=self.max_calib_seq_len
71 )
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/quantize/quantizer.py:570, in AwqQuantizer.init_quant(self, n_samples, max_seq_len)
568 modules[0] = Catcher(modules[0])
569 try:
--> 570 self.model(samples.to(next(self.model.parameters()).device))
571 except ValueError: # work with early exit
572 pass
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1189, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1186 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1188 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1189 outputs = self.model(
1190 input_ids=input_ids,
1191 attention_mask=attention_mask,
1192 position_ids=position_ids,
1193 past_key_values=past_key_values,
1194 inputs_embeds=inputs_embeds,
1195 use_cache=use_cache,
1196 output_attentions=output_attentions,
1197 output_hidden_states=output_hidden_states,
1198 return_dict=return_dict,
1199 cache_position=cache_position,
1200 )
1202 hidden_states = outputs[0]
1203 if self.config.pretraining_tp > 1:
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:977, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
974 hidden_states = inputs_embeds
976 # create position embeddings to be shared across the decoder layers
--> 977 position_embeddings = self.rotary_emb(hidden_states, position_ids)
979 # decoder layers
980 all_hidden_states = () if output_hidden_states else None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:209, in LlamaRotaryEmbedding.forward(self, x, position_ids)
207 device_type = device_type if isinstance(device_type, str) and device_type != \"mps\" else \"cpu\"
208 with torch.autocast(device_type=device_type, enabled=False):
--> 209 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
210 emb = torch.cat((freqs, freqs), dim=-1)
211 cos = emb.cos()
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)"
}