aws-neuron / aws-neuron-sdk

Powering AWS purpose-built machine learning chips. Blazing fast and cost-effective, natively integrated into PyTorch and TensorFlow, and integrated with your favorite AWS services
https://aws.amazon.com/machine-learning/neuron/

[torch-neuron] Wav2Vec2 model inference support on Inf1 #306

Closed: aws-maens closed this issue 1 year ago

aws-maens commented 3 years ago

Related issues: https://github.com/aws/aws-neuron-sdk/issues/258, https://github.com/aws/aws-neuron-sdk/issues/293

jluntamazon commented 1 year ago

This model can now be compiled as of the Neuron 2.5.0 release: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/prev/rn.html#neuron-2-5-0-11-23-2022

The following dependencies can be used with the script below:

pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com --upgrade "neuron-cc[tensorflow]" torch-neuron torchaudio transformers datasets librosa pandas==1.4.4

This uses an older version of pandas for tensorflow/numpy compatibility.
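As a quick sanity check (a hypothetical snippet, not part of the original instructions), you can confirm which pandas and numpy versions were actually resolved after installation:

import pandas
import numpy

# Print the resolved versions of the compatibility-sensitive packages
print("pandas", pandas.__version__)
print("numpy", numpy.__version__)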

The script may also require installing system libraries for loading audio. For example, on Ubuntu:

sudo apt-get install libsndfile-dev ffmpeg

The following code will compile the model for Neuron and perform an inference:

import os

from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Config,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
)
from datasets import load_dataset
import datasets
import torch
import torch_neuron

# Parameters
batch_size = 1
sampling_rate = 16000
max_seconds = 10
max_length = max_seconds * sampling_rate
name = "facebook/wav2vec2-base-960h"

def trace():

    # Reuse a previously compiled artifact if one already exists on disk
    filename = 'wav2vec2-neuron.pt'
    if os.path.exists(filename):
        return torch.jit.load(filename)

    model = Wav2Vec2ForCTC.from_pretrained(name, torchscript=True)
    model.eval()

    # Fixed-shape example input; the compiled model expects this exact shape
    example = torch.ones(1, max_length)

    # Compile for Neuron and cache the TorchScript artifact for reuse
    neuron = torch_neuron.trace(model, example)
    torch.jit.save(neuron, filename)
    return neuron

def infer(model, number_of_inferences=10):

    config = Wav2Vec2Config.from_pretrained(name) # Used for inputs_to_logits_ratio
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(name)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(name)

    # load first sample of English common_voice
    dataset = load_dataset("common_voice", "en", split="train", streaming=True)
    dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=sampling_rate))

    for i, sample in enumerate(iter(dataset)):

        if i >= number_of_inferences:
            return

        features = feature_extractor(
            sample["audio"]["array"],
            max_length=max_length,
            sampling_rate=sampling_rate,
            padding='max_length',
            return_tensors="pt"
        )

        result = model(features.input_values)
        logits = result[0][0] # NOTE: Assuming batch size 1

        pred_ids = torch.argmax(logits, dim=-1) # Note: can also be executed on Inferentia

        # retrieve word stamps (analogous commands for `output_char_offsets`)
        outputs = tokenizer.decode(pred_ids, output_word_offsets=True)

        # compute `time_offset` (seconds per logit frame) as the downsampling ratio divided by the sampling rate
        time_offset = config.inputs_to_logits_ratio / feature_extractor.sampling_rate

        word_offsets = [
            {
                "word": d["word"],
                "start_time": round(d["start_offset"] * time_offset, 2),
                "end_time": round(d["end_offset"] * time_offset, 2),
            }
            for d in outputs.word_offsets
        ]
        # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer:
        # https://huggingface.co/datasets/common_voice/viewer/en/train
        print(word_offsets[:3])

if __name__ == '__main__':
    model_neuron = trace()
    model_cpu = Wav2Vec2ForCTC.from_pretrained(name, torchscript=True).eval()

    print('Results Neuron:')
    infer(model_neuron)
    print('Results CPU:')
    infer(model_cpu)
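Because Neuron compiles for a fixed input shape, all inputs must be padded to max_length, which is why the feature extractor above uses padding='max_length'. For reference, here is a minimal latency sketch built on top of the same compiled artifact; it reuses batch_size and max_length from the script above, and the helper name measure_latency, the iteration count, and the warm-up loop are assumptions rather than part of the original script:

import time

def measure_latency(model, iterations=100):
    # Hypothetical helper: times forward passes on a fixed-shape dummy input.
    # The input shape must match the shape used at compile time (1 x max_length).
    example = torch.ones(batch_size, max_length)

    # Warm up so one-time initialization is not included in the measurement
    for _ in range(10):
        model(example)

    start = time.time()
    for _ in range(iterations):
        model(example)
    elapsed = time.time() - start
    print(f'Average latency: {elapsed / iterations * 1000:.2f} ms')

# Example usage after tracing:
# measure_latency(trace())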