result = pipe("some_audio.wav", generate_kwargs={"task": "transcribe"}) # same issue arises when using mp3 file
print(result["text"])
Running this results in the following crash:
Traceback (most recent call last):
File "/Users/hugo/programming/music_genre_classification/src/bug.py", line 32, in <module>
result = pipe("../../whisper/thomas.mp3")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 285, in __call__
return super().__call__(inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/base.py", line 1235, in __call__
return next(
^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__
item = next(self.iterator)
^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/pt_utils.py", line 269, in __next__
processed = self.infer(next(self.iterator), **self.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/base.py", line 1150, in forward
model_outputs = self._forward(model_inputs, **forward_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 508, in _forward
tokens = self.model.generate(
^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/models/whisper/generation_whisper.py", line 578, in generate
outputs = super().generate(
^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/utils.py", line 1758, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/utils.py", line 2410, in _sample
next_token_scores = logits_processor(input_ids, next_token_logits)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/logits_process.py", line 98, in __call__
scores = processor(input_ids, scores)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/logits_process.py", line 1784, in __call__
suppress_token_mask = torch.isin(vocab_tensor, self.begin_suppress_tokens)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
NotImplementedError: The operator 'aten::isin.Tensor_Tensor_out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1 to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.
### Expected behavior
The expected output is the following sentence, followed by a transcription of the soundfile.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
System Info
Setup with crash:
transformers version: 4.41.2
Working setup:
transformers version: 4.39.0
Who can help?
No response
Information
Tasks
`examples` folder (such as GLUE/SQuAD, ...)
Reproduction
device = "mps"
torch_dtype = torch.float32
model_id = "openai/whisper-tiny"
model = AutoModelForSpeechSeq2Seq.from_pretrained( model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, )
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline( "automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, max_new_tokens=128, chunk_length_s=30, batch_size=16, return_timestamps=True, torch_dtype=torch_dtype, device=device, )
result = pipe("some_audio.wav", generate_kwargs={"task": "transcribe"}) # same issue arises when using mp3 file
print(result["text"])
Traceback (most recent call last):
File "/Users/hugo/programming/music_genre_classification/src/bug.py", line 32, in <module>
result = pipe("../../whisper/thomas.mp3")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 285, in __call__
return super().__call__(inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/base.py", line 1235, in __call__
return next(
^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__
item = next(self.iterator)
^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/pt_utils.py", line 269, in __next__
processed = self.infer(next(self.iterator), **self.params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/base.py", line 1150, in forward
model_outputs = self._forward(model_inputs, **forward_params)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 508, in _forward
tokens = self.model.generate(
^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/models/whisper/generation_whisper.py", line 578, in generate
outputs = super().generate(
^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/utils.py", line 1758, in generate
result = self._sample(
^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/utils.py", line 2410, in _sample
next_token_scores = logits_processor(input_ids, next_token_logits)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/logits_process.py", line 98, in __call__
scores = processor(input_ids, scores)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/hugo/programming/music_genre_classification/env/lib/python3.12/site-packages/transformers/generation/logits_process.py", line 1784, in __call__
suppress_token_mask = torch.isin(vocab_tensor, self.begin_suppress_tokens)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
NotImplementedError: The operator 'aten::isin.Tensor_Tensor_out' is not currently implemented for the MPS device. If you want this op to be added in priority during the prototype phase of this feature, please comment on https://github.com/pytorch/pytorch/issues/77764. As a temporary fix, you can set the environment variable `PYTORCH_ENABLE_MPS_FALLBACK=1` to use the CPU as a fallback for this op. WARNING: this will be slower than running natively on MPS.