Open mshbaita-jo opened 9 months ago
An example. NB: make sure you're logged in to the Hugging Face CLI first (run `huggingface-cli login` in a shell/terminal).
import os

from datasets import Dataset, Audio
from huggingface_hub import HfApi, HfFolder, create_repo
# Path to the folder containing .wav files
input_folder = 'New Sound Recordings'
# Path to the translated text file
text_file_path = 'translated_texts_good.txt'
# Dataset repository on the Hugging Face Hub that receives the upload
repo_id = "Benjamin-png/dataset_dr"


def load_sample_texts(path):
    """Parse a numbered transcript file into a ``{number: text}`` dict.

    Each useful line looks like ``"<number>. <text>"``: an integer, a
    period, at least one whitespace character, then the text. Blank lines
    and lines that do not match this shape are skipped instead of aborting
    the whole run.

    Args:
        path: Path to the UTF-8 encoded transcript file.

    Returns:
        Dict mapping the integer prefix of each line to its stripped text.
    """
    texts = {}
    # 'utf-8-sig' also reads plain UTF-8, but drops a leading BOM that would
    # otherwise make the first line's number unparseable.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            number, sep, rest = raw_line.strip().partition('.')
            # Require whitespace right after the period so that values such
            # as "3.14 ..." are not mistaken for a numbered line.
            if not sep or not rest[:1].isspace():
                continue
            try:
                key = int(number)
            except ValueError:
                continue  # Prefix before the period is not a number.
            texts[key] = rest.strip()
    return texts


def build_entries(folder, texts, speaker_id=1):
    """Build one dataset row per numbered .wav file that has a transcript.

    Files are matched by the integer before the first ``.`` in their name
    (e.g. ``12.wav`` -> 12). Files without a numeric name or without a
    matching transcript are ignored. Rows are ordered numerically, so
    ``2.wav`` comes before ``10.wav``.

    Args:
        folder: Directory containing the .wav files.
        texts: Dict mapping number -> transcript text.
        speaker_id: Value stored in every row's ``speaker_id`` field.

    Returns:
        List of dicts with keys ``line_id``, ``audio``, ``text`` and
        ``speaker_id``; ``audio`` holds the file path.
    """
    numbered_files = []
    for filename in os.listdir(folder):
        if not filename.lower().endswith('.wav'):
            continue
        try:
            number = int(filename.split('.')[0])
        except ValueError:
            continue  # e.g. "intro.wav" has no numeric id to match on.
        if number in texts:
            numbered_files.append((number, filename))

    entries = []
    # Sorting the (number, filename) tuples gives numeric order; a plain
    # filename sort would put "10.wav" before "2.wav".
    for number, filename in sorted(numbered_files):
        entries.append({
            'line_id': f"BI{number:04d}",  # Generate a unique line_id
            'audio': os.path.join(folder, filename),  # Just use the file path
            'text': texts[number],
            'speaker_id': speaker_id,
        })
    return entries


def main():
    """Build the audio dataset from local files and push it to the Hub."""
    sample_texts = load_sample_texts(text_file_path)
    data = build_entries(input_folder, sample_texts)
    if not data:
        # Without rows there is no "audio" column to cast, so stop here with
        # a clear message instead of failing inside the datasets library.
        raise SystemExit(
            f"No .wav files in {input_folder!r} matched a numbered line "
            f"in {text_file_path!r}; nothing to upload."
        )

    # Create Dataset and mark the path column as an audio feature
    dataset = Dataset.from_list(data)
    dataset = dataset.cast_column("audio", Audio())

    # Ensure you have logged in using `huggingface-cli login`
    hf_token = HfFolder.get_token()

    # exist_ok=True makes an already existing repository a no-op while still
    # surfacing real failures (bad token, network) instead of hiding them.
    create_repo(repo_id=repo_id, repo_type="dataset", token=hf_token, exist_ok=True)
    print("Repository is ready.")

    # Push the entire dataset to Hugging Face
    dataset.push_to_hub(repo_id, token=hf_token)
    print("Dataset uploaded successfully!")


if __name__ == "__main__":
    main()
How can I upload my own dataset to Hugging Face in the same way that you uploaded yours? I mean with the audio and the metadata together in one file.