What is the best practice for quantizing a typical 16-bit 70B model on a single A100 VM?
What code do you use when you have a 70B model, e.g. an Unsloth fine-tuned model with the adapter merged?
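For reference, the code I'm running is essentially the standard AutoAWQ quantization example, reconstructed here from the traceback below; model_path and quant_path are placeholders:

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "path/to/merged-70b-model"  # placeholder: the merged Unsloth fine-tune
quant_path = "path/to/merged-70b-awq"    # placeholder: output directory

# Standard AutoAWQ 4-bit config
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
```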
With the default code I get:
Repo card metadata block was not found. Setting CardData to empty.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
{
"name": "RuntimeError",
"message": "Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)",
"stack": "---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[1], line 26
23 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
25 # Quantize
---> 26 model.quantize(tokenizer, quant_config=quant_config)
28 # Save quantized model
29 model.save_quantized(quant_path)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/models/base.py:211, in BaseAWQForCausalLM.quantize(self, tokenizer, quant_config, calib_data, split, text_column, duo_scaling, export_compatible, apply_clip, n_parallel_calib_samples, max_calib_samples, max_calib_seq_len, max_chunk_memory)
208 if hasattr(self, \"modules_to_not_convert\"):
209 self.quant_config.modules_to_not_convert = self.modules_to_not_convert
--> 211 self.quantizer = AwqQuantizer(
212 self,
213 self.model,
214 tokenizer,
215 self.quant_config.w_bit,
216 self.quant_config.q_group_size,
217 self.quant_config.zero_point,
218 self.quant_config.version,
219 calib_data,
220 split,
221 text_column,
222 duo_scaling,
223 modules_to_not_convert=self.quant_config.modules_to_not_convert,
224 export_compatible=export_compatible,
225 apply_clip=apply_clip,
226 n_parallel_calib_samples=n_parallel_calib_samples,
227 max_calib_samples=max_calib_samples,
228 max_calib_seq_len=max_calib_seq_len,
229 max_chunk_memory=max_chunk_memory,
230 )
231 self.quantizer.quantize()
233 self.is_quantized = True
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/quantize/quantizer.py:69, in AwqQuantizer.__init__(self, awq_model, model, tokenizer, w_bit, group_size, zero_point, version, calib_data, split, text_column, duo_scaling, modules_to_not_convert, export_compatible, apply_clip, n_parallel_calib_samples, max_calib_samples, max_calib_seq_len, max_chunk_memory)
65 self.max_chunk_memory = max_chunk_memory
66 self.modules_to_not_convert = (
67 modules_to_not_convert if modules_to_not_convert is not None else []
68 )
---> 69 self.modules, self.module_kwargs, self.inps = self.init_quant(
70 n_samples=self.max_calib_samples, max_seq_len=self.max_calib_seq_len
71 )
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/awq/quantize/quantizer.py:570, in AwqQuantizer.init_quant(self, n_samples, max_seq_len)
568 modules[0] = Catcher(modules[0])
569 try:
--> 570 self.model(samples.to(next(self.model.parameters()).device))
571 except ValueError: # work with early exit
572 pass
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:1189, in LlamaForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
1186 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1188 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 1189 outputs = self.model(
1190 input_ids=input_ids,
1191 attention_mask=attention_mask,
1192 position_ids=position_ids,
1193 past_key_values=past_key_values,
1194 inputs_embeds=inputs_embeds,
1195 use_cache=use_cache,
1196 output_attentions=output_attentions,
1197 output_hidden_states=output_hidden_states,
1198 return_dict=return_dict,
1199 cache_position=cache_position,
1200 )
1202 hidden_states = outputs[0]
1203 if self.config.pretraining_tp > 1:
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:977, in LlamaModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict, cache_position)
974 hidden_states = inputs_embeds
976 # create position embeddings to be shared across the decoder layers
--> 977 position_embeddings = self.rotary_emb(hidden_states, position_ids)
979 # decoder layers
980 all_hidden_states = () if output_hidden_states else None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/torch/utils/_contextlib.py:115, in context_decorator.<locals>.decorate_context(*args, **kwargs)
112 @functools.wraps(func)
113 def decorate_context(*args, **kwargs):
114 with ctx_factory():
--> 115 return func(*args, **kwargs)
File ~/code/genai-ml/.venv/lib/python3.10/site-packages/transformers/models/llama/modeling_llama.py:209, in LlamaRotaryEmbedding.forward(self, x, position_ids)
207 device_type = device_type if isinstance(device_type, str) and device_type != \"mps\" else \"cpu\"
208 with torch.autocast(device_type=device_type, enabled=False):
--> 209 freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
210 emb = torch.cat((freqs, freqs), dim=-1)
211 cos = emb.cos()
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)"
}