Open wvinzh opened 6 months ago
Yes - you can do so with the following:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
# Extract audio features using only the encoder of a speech seq2seq checkpoint.

# Run on the first GPU in half precision when CUDA is available; otherwise
# stay on CPU in full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# load model + processor
model_id = "distil-whisper/distil-large-v2"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)
# Only the encoder is needed for feature extraction; the decoder is not called.
encoder = model.get_encoder()
processor = AutoProcessor.from_pretrained(model_id)

# load dataset
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
)
audio = dataset[0]["audio"]
sample = audio["array"]

# preprocess inputs
# Pass the recording's sampling rate explicitly so the processor can check it
# against the rate it expects instead of silently assuming they match.
input_features = processor(
    sample,
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
).input_features
# The features must live on the same device and use the same dtype as the model.
input_features = input_features.to(device, dtype=torch_dtype)

# forward pass to get encoder hidden states
# No gradients are needed for inference, so skip autograd bookkeeping.
with torch.no_grad():
    # NOTE(review): presumably shaped (batch, frames, hidden_size) — confirm
    # against the model documentation.
    encoder_hidden_states = encoder(input_features).last_hidden_state
Can I use only its encoder to extract audio features? If so, how should I use it? Could you provide an example?