ylacombe / finetune-hf-vits

Finetune VITS and MMS using HuggingFace's tools
MIT License
120 stars 26 forks source link

Colab Notebook & Tutorial #1

Open allandclive opened 10 months ago

allandclive commented 10 months ago

It would be helpful to have an easy-to-follow Colab notebook & tutorial, or a Gradio UI, for easy fine-tuning.

ylacombe commented 10 months ago

Hey @allandclive , it's on my TODO. Have you been able to run the code on your own?

dunkerbunker commented 10 months ago

@ylacombe The code works perfectly for me and I can see it's doing fine on wandb. However, during inference I get the error "RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)"

Do note that instead of HF, I'm loading it from local.

import os
import subprocess
from transformers import pipeline
import scipy

# Load the fine-tuned VITS/MMS checkpoint from a local directory rather than
# from the Hugging Face Hub.
local_model_path = "./trained_models_dhivehi/"
synthesiser = pipeline("text-to-speech", model=local_model_path)  # add device=0 if you want to use a GPU

def uromanize(input_string, uroman_path):
    """Convert non-Roman strings to Roman using the `uroman` perl package.

    Args:
        input_string: Text to romanize (any script).
        uroman_path: Path to the uroman checkout root; the script is
            expected at ``<uroman_path>/bin/uroman.pl``.

    Returns:
        The romanized text with a single trailing newline (if any) removed.

    Raises:
        ValueError: If the perl process exits with a non-zero status.
    """
    script_path = os.path.join(uroman_path, "bin", "uroman.pl")

    # subprocess.run with an argument list (shell=False) is simpler than a
    # manual Popen/communicate pair and avoids shell-injection risks.
    result = subprocess.run(
        ["perl", script_path],
        input=input_string.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if result.returncode != 0:
        raise ValueError(f"Error {result.returncode}: {result.stderr.decode()}")

    # Strip only a trailing newline. The original `[:-1]` slice dropped the
    # last character unconditionally, corrupting output that happened not to
    # end with a newline.
    return result.stdout.decode().removesuffix("\n")

# Example input in Thaana script; uroman transliterates it to Latin characters
# before it is fed to the TTS pipeline.
text = "އެމްޑީޕީގެ މާލެއަށް ހާއްސަ މެނިފެސްޓޯ ފަހި ތަނަވަސް މާލެ"
uromanized_text = uromanize(text, uroman_path="../uroman")

speech = synthesiser(uromanized_text)

# NOTE(review): speech["audio"] is presumably float data straight from the
# model; writing it as-is produces a float WAV, which some players reject.
# Consider normalizing/converting to int16 before writing — verify.
scipy.io.wavfile.write("finetuned_output.wav", rate=speech["sampling_rate"], data=speech["audio"])
dunkerbunker commented 10 months ago

Turns out the error occurs in WSL. When I moved the model to base Windows and tried it, it worked fine with some minor modifications for normalization. Maybe update the Readme.

import os
import subprocess
from transformers import pipeline
import scipy.io.wavfile
import numpy as np

# Load the fine-tuned VITS/MMS checkpoint from a local directory rather than
# from the Hugging Face Hub.
local_model_path = "./trained_models_dhivehi/"
synthesiser = pipeline("text-to-speech", model=local_model_path)

def uromanize(input_string, uroman_path):
    """Romanize *input_string* by piping it through the `uroman.pl` script.

    Here `uroman_path` must point at the directory that directly contains
    `uroman.pl` (e.g. the uroman `bin` directory).
    """
    uroman_script = os.path.join(uroman_path, "uroman.pl")
    cmd = ["perl", uroman_script]

    # Feed the text on stdin and capture both output streams.
    proc = subprocess.run(
        cmd,
        input=input_string.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if proc.returncode:
        raise ValueError(f"Error {proc.returncode}: {proc.stderr.decode()}")

    # Drop the trailing newline emitted by uroman.
    return proc.stdout.decode()[:-1]

# Example input in Thaana script; romanize it because the fine-tuned
# checkpoint's tokenizer expects Latin characters.
text = "ހޯދުމުގެ މަސައްކަތް ވަރަށް ބާރަށް ކުރިއަށްދާކަމަށް އެފޯރަމްގެ ފަރާތުން ވަނީ ވިދާޅުވެފައެ"
uromanized_text = uromanize(text, uroman_path="../uroman/bin")

# Bug fix: the original passed the raw `text` to the pipeline, so the
# uromanized text computed above was never actually used.
speech = synthesiser(uromanized_text)

sampling_rate = speech["sampling_rate"]
# NOTE(review): the WAV header stores the sample rate in a 32-bit field, so
# this 16-bit cap is stricter than the format requires; kept to preserve the
# original behavior.
if sampling_rate > 65535:
    raise ValueError(f"Sampling rate {sampling_rate} exceeds the maximum allowed value.")

# Ensure the audio data is a NumPy array.
audio_data = speech["audio"]
if not isinstance(audio_data, np.ndarray):
    audio_data = np.array(audio_data)

# Normalize float model output into the int16 range most WAV players expect.
if audio_data.dtype != np.int16:
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data = np.int16(audio_data / peak * 32767)
    else:
        # Silent audio: dividing by a zero peak would produce NaNs.
        audio_data = audio_data.astype(np.int16)

# Represent mono audio as an (n_samples, 1) column to match scipy's
# (n_samples, n_channels) convention for 2-D data. (The original repeated
# this reshape a second time inside the try block; that was a no-op.)
if audio_data.ndim == 1:
    audio_data = audio_data.reshape(-1, 1)

# Log the final buffer properties for debugging.
print(f"Actual Audio Data Shape: {audio_data.shape}")
print(f"Actual Audio Data Type: {audio_data.dtype}")
print(f"Max value in Audio Data: {np.max(audio_data)}")
print(f"Min value in Audio Data: {np.min(audio_data)}")

try:
    scipy.io.wavfile.write("finetuned_output.wav", rate=sampling_rate, data=audio_data)
    print("Actual audio written successfully.")
except Exception as e:
    # Best-effort: report the write failure instead of crashing the script.
    print(f"Error writing actual audio: {e}")