Closed. kadirnar closed this issue 6 months ago.
Hi @kadirnar
Could you provide the full error log, thanks!
Install:
!pip install transformers bitsandbytes accelerate
!pip install flash-attn --no-build-isolation
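As an optional sanity check before loading the model (a diagnostic sketch, not part of the install steps above), you can confirm the pod exposes a CUDA GPU and that flash-attn installed cleanly:

# Optional diagnostics only.
import torch
print(torch.cuda.is_available())            # should be True on a GPU pod
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))    # the pod's GPU model

import flash_attn
print(flash_attn.__version__)               # fails here if the wheel did not build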
I made a mistake in the error message I posted earlier. Here is my actual error:
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You shouldn't move a model that is dispatched using accelerate hooks.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[3], line 22
12 model = AutoModelForSpeechSeq2Seq.from_pretrained(
13 model_id,
14 quantization_config=bnb_config,
(...)
17 attn_implementation="flash_attention_2",
18 )
20 processor = AutoProcessor.from_pretrained(model_id)
---> 22 pipe = model(
23 "automatic-speech-recognition",
24 model=model,
25 chunk_length_s=30,
26 max_new_tokens=128,
27 batch_size=24,
28 return_timestamps=True,
29 tokenizer=processor.tokenizer,
30 feature_extractor=processor.feature_extractor,
31 model_kwargs={"use_flash_attention_2": True},
32 generate_kwargs={"language": "english"},
33 )
File [/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1511](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py#line=1510), in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File [/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1520](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py#line=1519), in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
File [/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:161](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py#line=160), in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
160 def new_forward(module, *args, **kwargs):
--> 161 args, kwargs = module._hf_hook.pre_forward(module, *args, **kwargs)
162 if module._hf_hook.no_grad:
163 with torch.no_grad():
File [/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py:356](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py#line=355), in AlignDevicesHook.pre_forward(self, module, *args, **kwargs)
345 self.tied_pointers_to_remove.add((value.data_ptr(), self.execution_device))
347 set_module_tensor_to_device(
348 module,
349 name,
(...)
353 tied_params_map=self.tied_params_map,
354 )
--> 356 return send_to_device(args, self.execution_device), send_to_device(
357 kwargs, self.execution_device, skip_keys=self.skip_keys
358 )
File [/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:189](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py#line=188), in send_to_device(tensor, device, non_blocking, skip_keys)
186 elif skip_keys is None:
187 skip_keys = []
188 return type(tensor)(
--> 189 {
190 k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
191 for k, t in tensor.items()
192 }
193 )
194 else:
195 return tensor
File [/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:190](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py#line=189), in <dictcomp>(.0)
186 elif skip_keys is None:
187 skip_keys = []
188 return type(tensor)(
189 {
--> 190 k: t if k in skip_keys else send_to_device(t, device, non_blocking=non_blocking, skip_keys=skip_keys)
191 for k, t in tensor.items()
192 }
193 )
194 else:
195 return tensor
File [/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:174](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py#line=173), in send_to_device(tensor, device, non_blocking, skip_keys)
172 device = f"xpu:{device}"
173 else:
--> 174 raise error
175 try:
176 return tensor.to(device, non_blocking=non_blocking)
File [/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py:158](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/utils/operations.py#line=157), in send_to_device(tensor, device, non_blocking, skip_keys)
156 tensor = tensor.cpu()
157 try:
--> 158 return tensor.to(device, non_blocking=non_blocking)
159 except TypeError: # .to() doesn't accept non_blocking as kwarg
160 return tensor.to(device)
File [/usr/local/lib/python3.10/dist-packages/accelerate/big_modeling.py:456](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/accelerate/big_modeling.py#line=455), in dispatch_model.<locals>.add_warning.<locals>.wrapper(*args, **kwargs)
454 if param.device == torch.device("meta"):
455 raise RuntimeError("You can't move a model that has some modules offloaded to cpu or disk.")
--> 456 return fn(*args, **kwargs)
File [/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:2670](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py#line=2669), in PreTrainedModel.to(self, *args, **kwargs)
2666 @wraps(torch.nn.Module.to)
2667 def to(self, *args, **kwargs):
2668 # Checks if the model has been loaded in 8-bit
2669 if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES:
-> 2670 raise ValueError(
2671 "`.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the"
2672 " model has already been set to the correct devices and casted to the correct `dtype`."
2673 )
2674 elif getattr(self, "quantization_method", None) == QuantizationMethod.GPTQ:
2675 # For GPTQ models, we prevent users from casting the model to another dytpe to restrict unwanted behaviours.
2676 # the correct API should be to load the model with the desired dtype directly through `from_pretrained`.
2677 dtype_present_in_args = False
ValueError: `.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the model has already been set to the correct devices and casted to the correct `dtype`.
If I don't pass the `bnb_config` parameter, I get this error instead:
You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in WhisperForConditionalGeneration is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in WhisperModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in WhisperEncoder is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in WhisperDecoder is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[4], line 21
12 model = AutoModelForSpeechSeq2Seq.from_pretrained(
13 model_id,
14 low_cpu_mem_usage=True,
15 use_safetensors=True,
16 attn_implementation="flash_attention_2",
17 )
19 processor = AutoProcessor.from_pretrained(model_id)
---> 21 pipe = model(
22 "automatic-speech-recognition",
23 model=model,
24 chunk_length_s=30,
25 max_new_tokens=128,
26 batch_size=24,
27 return_timestamps=True,
28 tokenizer=processor.tokenizer,
29 feature_extractor=processor.feature_extractor,
30 model_kwargs={"use_flash_attention_2": True},
31 generate_kwargs={"language": "english"},
32 )
File [/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1511](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py#line=1510), in Module._wrapped_call_impl(self, *args, **kwargs)
1509 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1510 else:
-> 1511 return self._call_impl(*args, **kwargs)
File [/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1520](https://e44v1l2s1bp2my-8888.proxy.runpod.net/doc/tree/workspace/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py#line=1519), in Module._call_impl(self, *args, **kwargs)
1515 # If we don't have any hooks, we want to skip the rest of the logic in
1516 # this function, and just call forward.
1517 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1518 or _global_backward_pre_hooks or _global_backward_hooks
1519 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1520 return forward_call(*args, **kwargs)
1522 try:
1523 result = None
TypeError: WhisperForConditionalGeneration.forward() got an unexpected keyword argument 'model'
Hi @kadirnar, can you try:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, BitsAndBytesConfig
import torch

model_id = "distil-whisper/distil-large-v3"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    attn_implementation="flash_attention_2",
)

processor = AutoProcessor.from_pretrained(model_id)

+ pipe = pipeline(
- pipe = model(
    "automatic-speech-recognition",
    model=model,
    chunk_length_s=30,
    max_new_tokens=128,
    batch_size=24,
    return_timestamps=True,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
-   model_kwargs={"use_flash_attention_2": True},
    generate_kwargs={"language": "english"},
)
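Note that with the bitsandbytes-quantized model you should not call `model.to(...)` or pass a `device=` argument to the pipeline afterwards: the weights are already dispatched by accelerate, which is what the `ValueError` about `.to` above is complaining about. Once the pipeline is built, a minimal usage sketch (the audio path is just a placeholder) would be:

# Placeholder file; the ASR pipeline also accepts numpy arrays or dataset audio samples.
result = pipe("sample.wav")
print(result["text"])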
Please refer to the model card's snippets for running inference, e.g.:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = dataset[0]["audio"]

result = pipe(sample)
print(result["text"])
You can find these snippets here.
@younesbelkada, thank you very much for your support. The problem is solved ❤️
How do I add a streaming feature to the Whisper model? I found this, but it didn't work on my Mac. I also want to upload an .mp3 file instead of using the microphone.
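For the .mp3 part, the ASR pipeline accepts a local file path directly (it is decoded with ffmpeg), so you can pass the file in place of microphone input. For live microphone streaming, transformers ships a helper in transformers.pipelines.audio_utils; the sketch below is only an outline, and the chunk lengths are arbitrary choices rather than values from this thread:

# Transcribe a local .mp3 instead of the microphone (the path is a placeholder).
result = pipe("my_audio.mp3")
print(result["text"])

# Rough live-streaming sketch: feed short microphone chunks into the pipeline.
# Requires ffmpeg with a working capture device, which behaves differently per OS.
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

sampling_rate = pipe.feature_extractor.sampling_rate
mic = ffmpeg_microphone_live(
    sampling_rate=sampling_rate,
    chunk_length_s=5.0,   # length of each transcribed window (assumed value)
    stream_chunk_s=1.0,   # how often partial audio is yielded (assumed value)
)
for item in pipe(mic):
    print(item["text"])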
System Info
Who can help?
@Narsil, @SunMarc, @younesbelkada, @sanchit-gandhi
Information
Tasks
An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
Reproduction
Expected behavior
I am the developer of the WhisperPlus project. I'm running the Whisper model with the Transformers library and want to optimize it further. This code works in Colab, but it throws an error on the RunPod platform.