ylacombe / finetune-hf-vits

Finetune VITS and MMS using HuggingFace's tools
MIT License

uploading dataset to hugging face #11

mshbaita-jo opened this issue 9 months ago

mshbaita-jo commented 9 months ago

How can I upload my own dataset to Hugging Face in the same way that you uploaded yours? I mean with the audio and metadata in one file.

ylacombe commented 9 months ago

See this comment: https://github.com/ylacombe/finetune-hf-vits/issues/8#issuecomment-1941271588
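
For reference, a minimal sketch of the folder-based approach the datasets library supports out of the box: put your .wav files next to a metadata.csv (with a file_name column plus any extra columns such as text) and load them with the audiofolder builder. The folder path and repo name below are placeholders.

from datasets import load_dataset

# Expected layout (placeholder paths):
#   my_dataset/
#     metadata.csv   <- columns: file_name,text (file_name is relative to this folder)
#     1.wav
#     2.wav
dataset = load_dataset("audiofolder", data_dir="my_dataset")

# Push to the Hub (log in first with huggingface-cli login)
dataset.push_to_hub("your-username/your-dataset")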

benny-png commented 3 months ago

An example. NB: make sure you're logged in to the Hugging Face CLI (run huggingface-cli login in your shell/terminal).

import os

from datasets import Dataset, Audio
from huggingface_hub import HfApi, HfFolder, create_repo

# Path to the folder containing .wav files
input_folder = 'New Sound Recordings'

# Path to the translated text file
text_file_path = 'translated_texts_good.txt'
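# Expected line format: "<number>.  <text>", with two spaces after the
# period (this is what the split below relies on)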

# Load texts from the file into a dictionary with number IDs
sample_texts = {}
with open(text_file_path, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        line = line.strip()
        if line:  # Check if the line is not empty
            # Split on the first occurrence of '.  ' (two spaces) after the number
            parts = line.split('.  ', 1)  # Note the two spaces after the period
            if len(parts) > 1:
                # Use the first part as the key (number) and the second as the text
                sample_texts[int(parts[0])] = parts[1].strip()

# Create dataset entries based on available wav files
data = []
for filename in sorted(os.listdir(input_folder)):
    if filename.endswith('.wav'):
        # Extract the number from the filename (e.g., 1.wav)
        number = int(filename.split('.')[0])
        if number in sample_texts:
            file_path = os.path.join(input_folder, filename)
            line_id = f"BI{number:04d}"  # Generate a unique line_id
            text = sample_texts[number]  # Get the corresponding text
            speaker_id = 1  # Assuming one speaker

            entry = {
                'line_id': line_id,
                'audio': file_path,  # Just use the file path
                'text': text,
                'speaker_id': speaker_id
            }
            data.append(entry)

# Create Dataset
dataset = Dataset.from_list(data)

# Add audio feature to the dataset
dataset = dataset.cast_column("audio", Audio())
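# (casting to Audio() means each file path is decoded on access into
#  a dict with "array", "sampling_rate" and "path")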

# Upload to Hugging Face
hf_token = HfFolder.get_token()  # Ensure you have logged in using `huggingface-cli login`
api = HfApi()

# Create the repository if it doesn't exist
try:
    create_repo(repo_id="Benjamin-png/dataset_dr", repo_type="dataset", token=hf_token)
    print("Repository created successfully.")
except Exception as e:
    print(f"Repository creation failed or already exists: {e}")

# Push the entire dataset to Hugging Face
dataset.push_to_hub("Benjamin-png/dataset_dr", token=hf_token)

print("Dataset uploaded successfully!")