coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production
http://coqui.ai
Mozilla Public License 2.0
35.09k stars 4.28k forks source link

[Bug] xtts ft demo: empty csv files with the format_audio_list #3481

Closed dorbodwolf closed 8 months ago

dorbodwolf commented 10 months ago

Describe the bug

I use the formatter method to process my audio files(Chinese language), but I got the csv files with no data. Because it has never met the condition of if word.word[-1] in ["!", ".", "?"]:

To Reproduce

below is my code:

datapath = "/mnt/workspace/tdy.tdy/mp3_lww"
out_path = "/mnt/workspace/tdy.tdy/mp3_lww_train"
os.makedirs(out_path, exist_ok=True)
whisper_path = "/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3"
target_language = 'zh'
buffer=0.2
eval_percentage=0.15
speaker_name="lww"

import os
from os import path as osp
import torchaudio
from matplotlib import pyplot as plt
import torch
from faster_whisper import WhisperModel
import pandas
import gc

# Loading Whisper
device = "cuda" if torch.cuda.is_available() else "cpu" 
print("Loading Whisper Model!")
asr_model = WhisperModel(whisper_path, device=device, compute_type="float16", local_files_only=True)

def plot_waveform(waveform, sample_rate):
    waveform = waveform.numpy()

    num_channels, num_frames = waveform.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    if num_channels == 1:
        axes = [axes]
    for c in range(num_channels):
        axes[c].plot(time_axis, waveform[c], linewidth=1)
        axes[c].grid(True)
        if num_channels > 1:
            axes[c].set_ylabel(f"Channel {c+1}")
    figure.suptitle("waveform")

print("Reading audio files!")
audio_files = os.listdir(datapath)
audio_total_size = 0
metadata = {"audio_file": [], "text": [], "speaker_name": []}
for f in audio_files:
    if f.endswith('mp3'):
        audio_path = osp.join(datapath, f)
        wav, sr = torchaudio.load(audio_path)
        if wav.size(0) != 1:
            wav = torch.mean(wav, dim=0, keepdim=True)
        wav = wav.squeeze()
        audio_total_size += (wav.size(-1) / sr)
        # plot_waveform(wav, sr)
        segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
        segments = list(segments)
        i = 0
        sentence = ""
        sentence_start = None
        first_word = True
        # added all segments words in a unique list
        words_list = []
        for _, segment in enumerate(segments):
            words = list(segment.words)
            words_list.extend(words)

        # process each word
        for word_idx, word in enumerate(words_list):
            if first_word:
                sentence_start = word.start
                # If it is the first sentence, add buffer or get the begining of the file
                if word_idx == 0:
                    sentence_start = max(sentence_start - buffer, 0)  # Add buffer to the sentence start
                else:
                    # get previous sentence end
                    previous_word_end = words_list[word_idx - 1].end
                    # add buffer or get the silence midle between the previous sentence and the current one
                    sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start)/2)

                sentence = word.word
                first_word = False
            else:
                sentence += word.word

            if word.word[-1] in ["!", ".", "?"]:
                sentence = sentence[1:]
                # Expand number and abbreviations plus normalization
                sentence = multilingual_cleaners(sentence, target_language)
                audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))

                audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"

                # Check for the next word's existence
                if word_idx + 1 < len(words_list):
                    next_word_start = words_list[word_idx + 1].start
                else:
                    # If don't have more words it means that it is the last sentence then use the audio len as next word start
                    next_word_start = (wav.shape[0] - 1) / sr

                # Average the current word end and next word start
                word_end = min((word.end + next_word_start) / 2, word.end + buffer)

                absoulte_path = os.path.join(out_path, audio_file)
                os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
                i += 1
                first_word = True

                audio = wav[int(sr*sentence_start):int(sr*word_end)].unsqueeze(0)
                # if the audio is too short ignore it (i.e < 0.33 seconds)
                if audio.size(-1) >= sr/3:
                    torchaudio.save(absoulte_path,
                        audio,
                        sr
                    )
                else:
                    continue

                metadata["audio_file"].append(audio_file)
                metadata["text"].append(sentence)
                metadata["speaker_name"].append(speaker_name)

df = pandas.DataFrame(metadata)
df = df.sample(frac=1)
num_val_samples = int(len(df)*eval_percentage)

df_eval = df[:num_val_samples]
df_train = df[num_val_samples:]

df_train = df_train.sort_values('audio_file')
train_metadata_path = os.path.join(out_path, "metadata_train.csv")
df_train.to_csv(train_metadata_path, sep="|", index=False)

eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
df_eval = df_eval.sort_values('audio_file')
df_eval.to_csv(eval_metadata_path, sep="|", index=False)

# deallocate VRAM and RAM
del asr_model, df_train, df_eval, df, metadata
gc.collect()

print('audio total size: ', audio_total_size)

Expected behavior

there are data lines in metadata_train.csv and metadata_eval.csv

Logs

root@dsw-297768-d54489667-bcrfv:/mnt/workspace/clone_voice_sft_xtts# python process_audio_files.py 
2023-12-31 21:37:21,419 - modelscope - INFO - PyTorch version 2.1.0+cu118 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2023-12-31 21:37:21,421 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2023-12-31 21:37:21,462 - modelscope - INFO - Loading done! Current index file version is 1.10.0, with md5 44f0b88effe82ceea94a98cf99709694 and a total number of 946 components indexed
Loading Whisper Model!
/mnt/workspace/.cache/modelscope/keepitsimple/faster-whisper-large-v3
Reading audio files!
> /mnt/workspace/clone_voice_sft_xtts/process_audio_files.py(82)<module>()
-> if word.word[-1] in ["!", ".", "?"]:
(Pdb) words_list
[Word(start=0.0, end=0.42, word='但', probability=0.82470703125), Word(start=0.42, end=0.68, word='小', probability=0.9951171875), Word(start=0.68, end=1.06, word='狗', probability=0.99951171875), Word(start=1.06, end=1.18, word='呢', probability=0.8623046875), Word(start=1.18, end=1.34, word='它', probability=0.4169921875), Word(start=1.34, end=1.6, word='不是', probability=0.9970703125), Word(start=1.6, end=1.9, word='关', probability=0.904296875), Word(start=1.9, end=2.2, word='节', probability=0.99853515625), Word(start=2.2, end=2.38, word='它', probability=0.91015625), Word(start=2.38, end=2.64, word='是', probability=0.99951171875), Word(start=2.64, end=3.0, word='近', probability=0.362548828125), Word(start=3.0, end=3.72, word='病', probability=0.80419921875), Word(start=3.72, end=4.08, word='骨', probability=0.99072265625), Word(start=4.08, end=4.72, word='就', probability=0.9921875), Word(start=4.72, end=4.86, word='它', probability=0.9794921875), Word(start=4.86, end=5.16, word='病', probability=0.9990234375), Word(start=5.16, end=5.44, word='骨', probability=1.0), Word(start=5.44, end=5.6, word='和', probability=0.9990234375), Word(start=5.6, end=5.72, word='它', probability=0.99755859375), Word(start=5.72, end=6.0, word='那个', probability=0.99658203125), Word(start=6.0, end=6.24, word='什么', probability=0.994140625), Word(start=6.979999999999997, end=7.5, word='骨', probability=0.99853515625), Word(start=7.5, end=7.76, word='头', probability=1.0), Word(start=7.76, end=7.92, word='的', probability=1.0), Word(start=7.92, end=8.06, word='那个', probability=0.998046875), Word(start=8.06, end=8.26, word='位', probability=1.0), Word(start=8.26, end=8.54, word='置', probability=1.0), Word(start=8.54, end=8.84, word='它', probability=0.99560546875), Word(start=8.84, end=9.1, word='是', probability=1.0), Word(start=9.1, end=9.3, word='那个', probability=1.0), Word(start=9.3, end=9.74, word='地方', probability=1.0), Word(start=9.74, end=10.12, word='没', probability=0.9990234375), Word(start=10.12, end=10.32, word='长', probability=0.998046875), Word(start=10.32, end=10.66, word='好', probability=0.99951171875), Word(start=10.66, end=11.64, word='然后', probability=0.99853515625), Word(start=11.64, end=12.28, word='长', probability=0.99951171875), Word(start=12.28, end=12.7, word='期', probability=1.0), Word(start=12.7, end=12.86, word='那么', probability=0.9892578125), Word(start=12.86, end=13.16, word='走', probability=1.0), Word(start=13.16, end=13.4, word='路', probability=1.0), Word(start=13.4, end=13.52, word='呢', probability=0.990234375), Word(start=13.52, end=13.84, word='磨', probability=0.998291015625), Word(start=13.84, end=14.16, word='损', probability=0.999755859375), Word(start=14.16, end=14.5, word='导', probability=0.99951171875), Word(start=14.5, end=14.78, word='致', probability=1.0), Word(start=14.78, end=14.94, word='的', probability=0.98876953125), Word(start=14.94, end=15.92, word='就', probability=0.98681640625), Word(start=15.92, end=16.08, word='反', probability=1.0), Word(start=16.08, end=16.26, word='正', probability=1.0), Word(start=16.26, end=16.48, word='原', probability=0.9990234375), Word(start=16.48, end=16.62, word='理', probability=0.99755859375), Word(start=16.62, end=16.74, word='应', probability=0.99951171875), Word(start=16.74, end=16.84, word='该', probability=1.0), Word(start=16.84, end=16.96, word='都是', probability=1.0), Word(start=16.96, end=17.42, word='差不多', probability=0.99951171875), Word(start=17.42, end=17.7, word='反', probability=1.0), Word(start=17.7, end=17.84, word='正', probability=1.0), Word(start=17.84, end=18.08, word='就是', probability=1.0), Word(start=18.9, end=19.42, word='用', probability=0.99951171875), Word(start=19.42, end=19.7, word='力', probability=1.0), Word(start=19.7, end=19.86, word='用', probability=0.9990234375), Word(start=19.86, end=20.2, word='不对', probability=0.9990234375), Word(start=20.2, end=20.7, word='然后', probability=0.998046875), Word(start=20.7, end=21.68, word='导', probability=0.99951171875), Word(start=21.68, end=21.92, word='致', probability=1.0), Word(start=21.92, end=22.12, word='那个', probability=0.99658203125), Word(start=22.12, end=22.46, word='膝', probability=0.983154296875), Word(start=22.46, end=22.7, word='关', probability=0.99853515625), Word(start=22.7, end=22.96, word='节', probability=0.99951171875), Word(start=22.96, end=23.86, word='的', probability=0.99560546875), Word(start=23.86, end=24.04, word='那个', probability=0.9990234375), Word(start=24.04, end=24.36, word='白', probability=1.0), Word(start=24.36, end=24.66, word='色', probability=1.0), Word(start=24.66, end=24.74, word='的', probability=0.966796875), Word(start=24.74, end=24.9, word='那个', probability=0.97119140625), Word(start=24.9, end=25.22, word='软', probability=0.999267578125), Word(start=25.22, end=25.48, word='骨', probability=0.999755859375), Word(start=25.48, end=25.68, word='啊', probability=0.962890625), Word(start=25.68, end=26.58, word='就', probability=0.99853515625), Word(start=26.58, end=27.16, word='磨', probability=0.999755859375), Word(start=27.16, end=27.42, word='损', probability=1.0), Word(start=27.42, end=27.58, word='的', probability=0.9775390625), Word(start=27.58, end=27.72, word='太', probability=0.9990234375), Word(start=27.72, end=27.92, word='严', probability=0.999755859375), Word(start=27.92, end=28.16, word='重', probability=1.0), Word(start=28.16, end=28.26, word='了', probability=0.97509765625), Word(start=28.26, end=29.26, word='然后', probability=0.99560546875), Word(start=29.26, end=29.54, word='呢', probability=1.0), Word(start=29.54, end=29.82, word='现在', probability=0.38525390625), Word(start=29.82, end=30.08, word='呢', probability=0.283203125), Word(start=30.08, end=30.92, word='它', probability=0.1630859375), Word(start=30.92, end=31.16, word='走', probability=0.9970703125), Word(start=31.16, end=31.44, word='路', probability=0.99951171875), Word(start=31.44, end=31.52, word='呢', probability=0.89697265625), Word(start=31.52, end=31.74, word='它', probability=0.9326171875), Word(start=31.74, end=31.94, word='是', probability=0.98681640625), Word(start=31.94, end=32.18, word='骨', probability=0.991943359375), Word(start=32.18, end=32.38, word='头', probability=0.9970703125), Word(start=32.38, end=32.64, word='磨', probability=0.907470703125), Word(start=32.64, end=32.74, word='着', probability=0.76904296875), Word(start=32.74, end=32.96, word='骨', probability=0.994873046875), Word(start=32.96, end=33.2, word='头', probability=0.99951171875), Word(start=33.2, end=33.48, word='所以', probability=0.96240234375), Word(start=33.48, end=33.58, word='就', probability=0.990234375), Word(start=33.58, end=33.72, word='会', probability=0.99853515625), Word(start=33.72, end=33.94, word='很', probability=0.9990234375), Word(start=33.94, end=34.26, word='疼', probability=0.994384765625), Word(start=34.26, end=34.96, word='或者', probability=0.98193359375), Word(start=34.96, end=35.2, word='是', probability=0.9990234375), Word(start=35.2, end=35.76, word='那个', probability=0.79638671875), Word(start=35.76, end=37.32, word='软', probability=0.997314453125), Word(start=37.32, end=37.58, word='骨', probability=0.9990234375), Word(start=37.58, end=37.68, word='比', probability=0.98974609375), Word(start=37.68, end=38.1, word='较', probability=1.0), Word(start=38.92, end=38.94, word='比', probability=0.4990234375), Word(start=38.94, end=39.34, word='较', probability=1.0), Word(start=39.34, end=39.64, word='薄', probability=1.0), Word(start=39.64, end=39.78, word='了', probability=0.9990234375), Word(start=39.78, end=40.22, word='所以', probability=0.99658203125), Word(start=40.22, end=40.38, word='它', probability=0.96826171875), Word(start=40.38, end=40.56, word='就', probability=0.99755859375), Word(start=40.56, end=41.24, word='不能', probability=0.998046875), Word(start=41.24, end=41.66, word='缓', probability=0.99951171875), Word(start=41.66, end=41.98, word='冲', probability=0.99853515625), Word(start=41.98, end=42.62, word='所以', probability=0.99072265625), Word(start=42.62, end=42.78, word='就', probability=0.99951171875), Word(start=42.78, end=42.9, word='比', probability=0.99951171875), Word(start=42.9, end=43.08, word='较', probability=1.0), Word(start=43.08, end=43.4, word='疼', probability=0.999755859375)]
(Pdb) len(words_list)
129
(Pdb) words_list[0]
Word(start=0.0, end=0.42, word='但', probability=0.82470703125)
(Pdb) q
Traceback (most recent call last):
  File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
    sentence = sentence[1:]
  File "/mnt/workspace/clone_voice_sft_xtts/process_audio_files.py", line 82, in <module>
    sentence = sentence[1:]
  File "/opt/conda/lib/python3.10/bdb.py", line 90, in trace_dispatch
    return self.dispatch_line(frame)
  File "/opt/conda/lib/python3.10/bdb.py", line 115, in dispatch_line
    if self.quitting: raise BdbQuit
bdb.BdbQuit
^[[A

Environment

{
    "CUDA": {
        "GPU": [
            "Tesla V100-SXM2-16GB"
        ],
        "available": true,
        "version": "11.8"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.1.0+cu118",
        "numpy": "1.26.2"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.10.13",
        "version": "#1 SMP Tue Jun 20 06:15:49 UTC 2023"
    }
}

Additional context

I installed TTS by this:

rm -rf TTS/ # delete repo to be able to reinstall if needed
git clone --branch xtts_demo  https://github.com/coqui-ai/TTS.git
pip install --use-deprecated=legacy-resolver  -e TTS
pip install --use-deprecated=legacy-resolver  -r TTS/TTS/demos/xtts_ft_demo/requirements.txt
pip install typing_extensions==4.8.0 numpy==1.26.2
stale[bot] commented 9 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look our discussion channels.