Using MMS_FA to get word timestamps from a transcription, I get very strange results for spoken numbers (in English, below).
The numbers are not detected, and the probabilities are very low for these words — all of them.
For example, using the simple audio "http://files.gladia.io/aligner/issue/date.wav" with the expected transcript "today is february eight two thousand nineteen", only "today is february" is detected. This is reproducible with other files that include years.
This is code to simply reproduce it:
import math
import os
import requests
import torch
import torchaudio
import matplotlib.pyplot as plt
def run_example():
    """Reproduce the MMS_FA alignment issue: download the sample clip and
    print the best-scoring token (and its probability) for every frame."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    bundle = torchaudio.pipelines.MMS_FA

    # Fetch the sample audio next to the script.
    audio_url = 'http://files.gladia.io/aligner/issue/date.wav'
    file_name = audio_url.split('/')[-1]
    response = requests.get(audio_url)
    with open(os.path.join('.', file_name), 'wb') as out:
        out.write(response.content)

    emission = get_emission(file_name, bundle, device)
    plot_emission(emission, file_name)

    # Map label index -> character so each frame's argmax can be printed.
    label_to_index = bundle.get_dict(star=None)
    index_to_label = {idx: label for label, idx in label_to_index.items()}

    for frame in emission[0]:
        scores = frame.tolist()
        best_index, best_score = max(enumerate(scores), key=lambda pair: pair[1])
        token = index_to_label[best_index]
        # Emissions are log-probabilities; exponentiate for a readable value.
        print(token, f"{math.exp(best_score):.2f}")
def get_emission(audio_path: str, bundle, device: torch.device) -> torch.Tensor:
    """Load ``audio_path`` and return the model's frame-wise emission tensor.

    Parameters
    ----------
    audio_path:
        Path to an audio file readable by ``torchaudio.load``.
    bundle:
        A torchaudio pipeline bundle (here ``torchaudio.pipelines.MMS_FA``).
    device:
        Device the model runs on (the caller passes a ``torch.device``;
        the previous ``str`` annotation was incorrect).

    Returns
    -------
    torch.Tensor
        Emission log-probabilities of shape (batch, frames, labels).
    """
    waveform, sample_rate = torchaudio.load(audio_path)
    # MMS_FA expects audio at the bundle's sample rate (16 kHz). Feeding a
    # mismatched rate silently degrades the emissions, which can look exactly
    # like the "low probabilities" symptom — resample defensively.
    if sample_rate != bundle.sample_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, bundle.sample_rate
        )
    model = bundle.get_model(with_star=False).to(device)
    with torch.inference_mode():
        emission, _ = model(waveform.to(device))
    return emission
def plot_emission(emission, filename):
    """Save a heatmap of the frame-wise class probabilities.

    Parameters
    ----------
    emission:
        Emission tensor of shape (batch, frames, labels); moved to CPU here.
    filename:
        Name of the source audio file; used to name the output PNG.
        (Bug fix: the parameter was previously ignored and ``savefig`` used a
        hard-coded, placeholder-free f-string as the output path.)
    """
    emission_np = emission.cpu().numpy()
    fig, ax = plt.subplots(figsize=(20, 5))
    im = ax.imshow(emission_np.T, aspect='auto', origin='lower', cmap='viridis')
    ax.set_title("Frame-wise class probabilities")
    ax.set_xlabel("Time")
    ax.set_ylabel("Labels")
    cbar = plt.colorbar(im)
    cbar.set_label('Probability')
    fig.tight_layout()
    # Derive the image name from the audio file instead of a fixed path.
    stem = os.path.splitext(os.path.basename(filename))[0]
    plt.savefig(f"./{stem}.png")
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close(fig)
🐛 Describe the bug
Using MMS_FA to get word timestamps from a transcription, I get very strange results for spoken numbers (in English, below). The numbers are not detected, and the probabilities are very low for these words — all of them.
For example, using the simple audio "http://files.gladia.io/aligner/issue/date.wav" with the expected transcript "today is february eight two thousand nineteen", only "today is february" is detected. This is reproducible with other files that include years.
This is code to simply reproduce it:
Emission visualization:
Output:
Versions