Open alvaropastor7 opened 6 hours ago
Hi, I'm trying to run inference on an AWQ-quantized model and I keep getting an error when generating text. I'm using Qwen2.5-72B-Instruct-AWQ. Some code for context:
```python
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# (inside my model wrapper class)
self._model = AutoAWQForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    token=hf_token,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.float16,
)
# Load the tokenizer
self._tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=hf_token,
)

def _call(self, prompt: str, **kwargs) -> str:
    # Default generation parameters
    max_new_tokens = kwargs.get("max_new_tokens", 400)
    temperature = kwargs.get("temperature", 1)
    top_k = kwargs.get("top_k", 30)

    model = self.load_model()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Encode the prompt
    inputs = self._tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Generation parameters
    generation_params = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": True,
        "top_k": top_k,
        "eos_token_id": self._tokenizer.eos_token_id,
        "pad_token_id": self._tokenizer.eos_token_id,
        "top_p": 0.95,
        "repetition_penalty": 1.2,
    }

    # Generate text
    outputs = model.generate(**generation_params)

    # Decode the output
    generated_text = self._tokenizer.decode(
        outputs[0], skip_special_tokens=True
    )
    return generated_text[len(prompt):].strip()
```
Thanks :)
Please use `from_quantized` instead of `from_pretrained` to load an already-quantized AWQ checkpoint.
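A minimal sketch of the suggested fix, assuming the autoawq package and the published Qwen/Qwen2.5-72B-Instruct-AWQ repo id (keyword support such as `fuse_layers` can vary between autoawq versions):

```python
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_name = "Qwen/Qwen2.5-72B-Instruct-AWQ"

# Load the pre-quantized weights with from_quantized (from_pretrained expects
# unquantized weights and is meant for the quantization step itself)
model = AutoAWQForCausalLM.from_quantized(
    model_name,
    device_map="auto",
    fuse_layers=True,  # optional speed-up; drop it if your autoawq version complains
)

# The tokenizer is still loaded the usual way
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True, top_p=0.95)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

The rest of your `_call` method (sampling parameters, decoding, stripping the prompt) should work unchanged once the model is loaded this way.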