Closed aws-maens closed 1 year ago
This model can now be compiled as of the 2.5.0 release of Neuron: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/prev/rn.html#neuron-2-5-0-11-23-2022
The following dependencies can be used with the script below:
pip install --extra-index-url=https://pip.repos.neuron.amazonaws.com --upgrade "neuron-cc[tensorflow]" torch-neuron torchaudio transformers datasets librosa pandas==1.4.4
This uses an older version of pandas for tensorflow/numpy compatibility.
The script may also require installing system libraries for loading audio. For example, on Ubuntu:
sudo apt-get install libsndfile-dev ffmpeg
The following code will compile the model for Neuron and perform an inference:
import os
from transformers import (
Wav2Vec2ForCTC,
Wav2Vec2Config,
Wav2Vec2CTCTokenizer,
Wav2Vec2FeatureExtractor
)
from datasets import load_dataset
import datasets
import torch
import torch_neuron
# Parameters
# Identifier of the pretrained checkpoint passed to every `from_pretrained` call.
name = "facebook/wav2vec2-base-960h"
batch_size = 1
# Audio is resampled to this rate and clips are padded to `max_seconds` of it.
sampling_rate = 16000
max_seconds = 10
# Length, in samples, of the example input the model is traced with.
max_length = sampling_rate * max_seconds
def trace():
    """Return the Neuron-compiled model, compiling it only when no saved copy exists.

    The compiled artifact is stored as ``wav2vec2-neuron.pt`` in the current
    working directory. When that file is already present it is loaded with
    ``torch.jit.load`` and no compilation happens.

    Returns:
        The traced model, either freshly produced by ``torch_neuron.trace`` or
        loaded from the saved file.
    """
    artifact_path = 'wav2vec2-neuron.pt'
    if not os.path.exists(artifact_path):
        # No saved artifact yet: load the checkpoint and compile it for an
        # input of shape (1, max_length).
        cpu_model = Wav2Vec2ForCTC.from_pretrained(name, torchscript=True)
        cpu_model.eval()
        compiled = torch_neuron.trace(cpu_model, torch.ones(1, max_length))
        torch.jit.save(compiled, artifact_path)
        return compiled
    return torch.jit.load(artifact_path)
def infer(model, number_of_inferences=10):
    """Run ``model`` on streamed Common Voice samples and print word timings.

    For each sample the audio is padded or truncated to ``max_length``
    samples, passed through ``model``, greedily decoded, and the first three
    decoded words are printed with their start and end times in seconds.

    Args:
        model: Callable that takes ``features.input_values`` and returns a
            sequence whose first element holds the logits, indexed as
            ``result[0][0]`` (batch size 1 is assumed).
        number_of_inferences: Number of dataset samples to process.

    Returns:
        None. Results are printed to stdout.
    """
    config = Wav2Vec2Config.from_pretrained(name)  # Used for inputs_to_logits_ratio
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(name)
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(name)

    # Seconds of audio per logit frame: the downsampling ratio divided by the
    # sampling rate. Loop-invariant, so computed once up front.
    time_offset = config.inputs_to_logits_ratio / feature_extractor.sampling_rate

    # Stream the English common_voice train split, resampled to `sampling_rate`.
    dataset = load_dataset("common_voice", "en", split="train", streaming=True)
    dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=sampling_rate))

    for i, sample in enumerate(dataset):
        # `>=` so that exactly `number_of_inferences` samples are processed
        # (the previous `>` ran one extra).
        if i >= number_of_inferences:
            return

        # Pad short clips AND truncate long ones, so the input always has
        # `max_length` samples like the example used in trace().
        features = feature_extractor(
            sample["audio"]["array"],
            max_length=max_length,
            sampling_rate=sampling_rate,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        result = model(features.input_values)
        logits = result[0][0]  # NOTE: Assuming batch size 1
        pred_ids = torch.argmax(logits, dim=-1)  # Note: Can be executed on inferentia

        # retrieve word stamps (analogous commands for `output_char_offsets`)
        outputs = tokenizer.decode(pred_ids, output_word_offsets=True)

        word_offsets = [
            {
                "word": d["word"],
                "start_time": round(d["start_offset"] * time_offset, 2),
                "end_time": round(d["end_offset"] * time_offset, 2),
            }
            for d in outputs.word_offsets
        ]

        # compare word offsets with audio `common_voice_en_100038.mp3` online on the dataset viewer:
        # https://huggingface.co/datasets/common_voice/viewer/en/train
        print(word_offsets[:3])
if __name__ == '__main__':
    # Build both models before running anything, then run the same inference
    # routine on each so the printed outputs can be compared.
    model_neuron = trace()
    model_cpu = Wav2Vec2ForCTC.from_pretrained(name, torchscript=True).eval()
    for heading, candidate in (
        ('Results Neuron:', model_neuron),
        ('Results CPU:', model_cpu),
    ):
        print(heading)
        infer(candidate)
https://github.com/aws/aws-neuron-sdk/issues/258 https://github.com/aws/aws-neuron-sdk/issues/293