allandclive opened 10 months ago
Hey @allandclive, it's on my TODO. Have you been able to run the code on your own?
@ylacombe The code works perfectly for me and I can see it's doing fine on wandb. However, during inference I get the error:

```
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
```

Do note that instead of loading from the HF Hub, I'm loading the model from a local path.
```python
import os
import subprocess

import scipy.io.wavfile
from transformers import pipeline

local_model_path = "./trained_models_dhivehi/"
synthesiser = pipeline("text-to-speech", model=local_model_path)  # add device=0 if you want to use a GPU


def uromanize(input_string, uroman_path):
    """Convert non-Roman strings to Roman using the `uroman` perl package."""
    script_path = os.path.join(uroman_path, "bin", "uroman.pl")
    command = ["perl", script_path]
    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Execute the perl command
    stdout, stderr = process.communicate(input=input_string.encode())
    if process.returncode != 0:
        raise ValueError(f"Error {process.returncode}: {stderr.decode()}")
    # Return the output as a string and skip the new-line character at the end
    return stdout.decode()[:-1]


text = "އެމްޑީޕީގެ މާލެއަށް ހާއްސަ މެނިފެސްޓޯ ފަހި ތަނަވަސް މާލެ"
uromanized_text = uromanize(text, uroman_path="../uroman")
speech = synthesiser(uromanized_text)

scipy.io.wavfile.write("finetuned_output.wav", rate=speech["sampling_rate"], data=speech["audio"])
```
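For anyone hitting the same error: the embedding layer requires Long/Int token indices, so a quick way to check whether the dtype is really the problem is to bypass the pipeline and call the model directly with an explicit cast. A minimal sketch, assuming the fine-tuned checkpoint is a VITS model at the local path above (the `.long()` cast is the only non-standard step):

```python
import torch
from transformers import AutoTokenizer, VitsModel

model = VitsModel.from_pretrained("./trained_models_dhivehi/")
tokenizer = AutoTokenizer.from_pretrained("./trained_models_dhivehi/")

inputs = tokenizer(uromanized_text, return_tensors="pt")
# Embedding lookups need integer indices; cast defensively in case
# something upstream produced a float tensor.
inputs["input_ids"] = inputs["input_ids"].long()

with torch.no_grad():
    waveform = model(**inputs).waveform  # shape: (batch, num_samples)
```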
Turns out the error occurs in WSL. When I moved the model to base Windows and tried again, it worked fine with some minor modifications for normalization. Maybe update the README:
```python
import os
import subprocess

import numpy as np
import scipy.io.wavfile
from transformers import pipeline

local_model_path = "./trained_models_dhivehi/"
synthesiser = pipeline("text-to-speech", model=local_model_path)


def uromanize(input_string, uroman_path):
    """Convert non-Roman strings to Roman using the `uroman` perl package."""
    script_path = os.path.join(uroman_path, "uroman.pl")
    command = ["perl", script_path]
    process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Execute the perl command
    stdout, stderr = process.communicate(input=input_string.encode())
    if process.returncode != 0:
        raise ValueError(f"Error {process.returncode}: {stderr.decode()}")
    # Return the output as a string and skip the new-line character at the end
    return stdout.decode()[:-1]


text = "ހޯދުމުގެ މަސައްކަތް ވަރަށް ބާރަށް ކުރިއަށްދާކަމަށް އެފޯރަމްގެ ފަރާތުން ވަނީ ވިދާޅުވެފައެ"
uromanized_text = uromanize(text, uroman_path="../uroman/bin")
speech = synthesiser(uromanized_text)

sampling_rate = speech["sampling_rate"]
if sampling_rate > 65535:
    raise ValueError(f"Sampling rate {sampling_rate} exceeds the maximum allowed value.")

# Ensure the audio data is a NumPy array
audio_data = speech["audio"]
if not isinstance(audio_data, np.ndarray):
    audio_data = np.array(audio_data)

# Peak-normalize and convert the audio data to int16 if necessary
if audio_data.dtype != np.int16:
    audio_data = np.int16(audio_data / np.max(np.abs(audio_data)) * 32767)

# Squeeze a possible (1, num_samples) batch dimension, then write mono
# audio as a (num_samples, 1) column so scipy interprets it correctly
audio_data = np.squeeze(audio_data)
if audio_data.ndim == 1:
    audio_data = audio_data.reshape(-1, 1)

# Re-check and print the properties of the actual audio data
print(f"Actual Audio Data Shape: {audio_data.shape}")
print(f"Actual Audio Data Type: {audio_data.dtype}")
print(f"Max value in Audio Data: {np.max(audio_data)}")
print(f"Min value in Audio Data: {np.min(audio_data)}")

try:
    scipy.io.wavfile.write("finetuned_output.wav", rate=sampling_rate, data=audio_data)
    print("Actual audio written successfully.")
except Exception as e:
    print(f"Error writing actual audio: {e}")
```
It would be helpful to have an easy-to-follow Colab notebook and tutorial, or a Gradio UI with easy fine-tuning.
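Until an official demo exists, a minimal Gradio inference UI is only a few lines on top of the snippets above. A sketch, assuming the `synthesiser` pipeline and `uromanize` helper defined earlier (the uroman path is the one from this thread):

```python
import gradio as gr
import numpy as np


def tts(text: str):
    # Romanize the input, synthesize, and return (sample_rate, waveform)
    # in the tuple format gr.Audio expects.
    romanized = uromanize(text, uroman_path="../uroman/bin")
    speech = synthesiser(romanized)
    return speech["sampling_rate"], np.squeeze(speech["audio"])


demo = gr.Interface(
    fn=tts,
    inputs=gr.Textbox(label="Dhivehi text"),
    outputs=gr.Audio(label="Synthesized speech"),
)
demo.launch()
```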