huggingface / optimum-intel

🤗 Optimum Intel: Accelerate inference with Intel optimization tools
https://huggingface.co/docs/optimum/main/en/intel/index
Apache License 2.0

OVModelForSpeechSeq2Seq fails with return_timestamps="word". #561

Open barolo opened 8 months ago

barolo commented 8 months ago

@helena-intel

This works:

from transformers import AutoProcessor, GenerationConfig, pipeline
from pathlib import Path
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
import json

model_id = "openai/whisper-small"
processor = AutoProcessor.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id)

ov_config = {"CACHE_DIR": ""}
model_path = Path(model_id.replace('/', '_'))

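# Export the PyTorch checkpoint to an OpenVINO IR on the first run; reuse the saved IR afterwards.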
if not model_path.exists():
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        model_id, ov_config=ov_config, export=True, compile=False, load_in_8bit=False
    )
    ov_model.half()
    ov_model.save_pretrained(model_path)
else:
    ov_model = OVModelForSpeechSeq2Seq.from_pretrained(
        model_path, ov_config=ov_config, compile=False
    )

ov_model.generation_config = generation_config

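# optimum-intel expects an OpenVINO device name as a string ("cpu", "gpu", ...), not a torch.device.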
device = 'gpu'
ov_model.to(device)
ov_model.compile()

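# The stock transformers ASR pipeline accepts the OpenVINO model as a drop-in replacement.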
pipe = pipeline(
    "automatic-speech-recognition",
    model=ov_model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
    batch_size=18,
)
result = pipe("./4.wav")

with open("sample.json", "w") as outfile:
    json.dump(result, outfile)
print(result["text"])

This doesn't work — the script is identical except that the pipe() call passes return_timestamps="word":

result = pipe("./4.wav", return_timestamps="word")

and fails with:

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino
Compiling the encoder to GPU ...
Compiling the decoder to GPU ...
Compiling the decoder to GPU ...
device must be of type <class 'str'> but got <class 'torch.device'> instead
Traceback (most recent call last):
  File "/run/media/greggy/1a4fd6d7-1f9d-42c6-9324-661804695013/D/owisp/./n2.py", line 44, in <module>
    result = pipe("./4.wav", return_timestamps="word")
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 292, in __call__
    return super().__call__(inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/base.py", line 1154, in __call__
    return next(
           ^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py", line 124, in __next__
    item = next(self.iterator)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py", line 266, in __next__
    processed = self.infer(next(self.iterator), **self.params)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/base.py", line 1068, in forward
    model_outputs = self._forward(model_inputs, **forward_params)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py", line 507, in _forward
    tokens = self.model.generate(
             ^^^^^^^^^^^^^^^^^^^^
  File "/home/greggy/.local/lib/python3.11/site-packages/optimum/intel/openvino/modeling_seq2seq.py", line 1018, in generate
    outputs["token_timestamps"] = self._extract_token_timestamps(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: '_OVModelForWhisper' object has no attribute '_extract_token_timestamps'
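
The traceback shows that optimum-intel's generate() delegates word-level timestamp extraction to a _extract_token_timestamps method that _OVModelForWhisper does not define, although transformers' WhisperForConditionalGeneration does. As an untested stopgap — a sketch only, assuming the OpenVINO decoder actually returns the cross-attentions the method needs — one could bind the transformers implementation onto the OpenVINO instance:

import types
from transformers import WhisperForConditionalGeneration

# Hypothetical workaround: borrow the word-timestamp extraction method from the
# pure PyTorch Whisper model and bind it to the OpenVINO wrapper instance.
ov_model._extract_token_timestamps = types.MethodType(
    WhisperForConditionalGeneration._extract_token_timestamps, ov_model
)

This only removes the AttributeError; the method aligns words using the decoder's cross-attentions, so it can still fail if the exported OpenVINO decoder does not expose them. Segment-level timestamps (return_timestamps=True) are decoded from the tokenizer's timestamp tokens instead and do not hit this code path.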

For reference, the equivalent pure-transformers script (no OpenVINO) works and returns word-level timestamps successfully.
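
A minimal sketch of that pure-transformers version (same model id and audio file as above):

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, pipeline

model_id = "openai/whisper-small"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,
)
result = pipe("./4.wav", return_timestamps="word")
print(result["chunks"])  # each entry pairs a word with its (start, end) timestamps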

barolo commented 8 months ago

I noticed too late that this is already assigned to @eaidova in the OpenVINO repo; this report is more precise, though.

helena-intel commented 7 months ago

Issue in the OpenVINO repo: https://github.com/openvinotoolkit/openvino/issues/22794. @eaidova, could you have a look?