Changing the seed does nothing.

arthurwolf commented 8 months ago

(Amazing work!)

It generates the exact same wav file no matter what seed I input, no variation (that I can notice).

Any idea why?

I'm using fast_inference.py, just modified enough to take command line parameters (including the seed):

# Import required modules for file and system operations
import os
import grp
import pwd
import shutil
import tempfile
import time
from pathlib import Path

# Import the random module
import random 

# Import argparse for command-line argument parsing
import argparse

# Import necessary libraries for audio processing and deep learning
import librosa
import torch
from huggingface_hub import snapshot_download

# Import specific classes and functions from the project's modules
from fam.llm.adapters import FlattenedInterleavedEncodec2Codebook
from fam.llm.decoders import EncodecDecoder
from fam.llm.fast_inference_utils import build_model, main
from fam.llm.inference import (
    EncodecDecoder,
    InferenceConfig,
    Model,
    TiltedEncodec,
    TrainedBPETokeniser,
    get_cached_embedding,
    get_cached_file,
    get_enhancer,
)
from fam.llm.utils import (
    check_audio_file,
    get_default_dtype,
    get_device,
    normalize_text,
)

# Define a function to parse command-line arguments
def parse_args(argv=None):
    """Parse the command-line options of the text-to-speech script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. When left as None (the default), argparse
            falls back to ``sys.argv[1:]``, which is what the script entry
            point relies on. Passing an explicit list makes the parser
            usable from other code and from tests.

    Returns:
        argparse.Namespace with the attributes ``text``, ``spk_ref_path``,
        ``top_p``, ``guidance_scale``, ``temperature``, ``output_dir`` and
        ``seed`` (``seed`` is None when the option is not given).

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            a value cannot be converted to the declared type.
    """
    parser = argparse.ArgumentParser(description='Text-to-Speech using the specified model.')

    # Required inputs: what to say and whose voice to imitate
    parser.add_argument('--text', type=str, required=True, help='Text to convert to speech.')
    parser.add_argument('--spk_ref_path', type=str, required=True, help='Path to speaker reference audio file. Minimum 30 seconds of audio required.')

    # Sampling controls
    parser.add_argument('--top_p', type=float, default=0.95, help='Top p for sampling. Range [0.9, 1.0]. A measure of speech stability.')
    parser.add_argument('--guidance_scale', type=float, default=3.0, help='Guidance scale for sampling. Range [1.0, 3.0]. A measure of speaker similarity.')
    parser.add_argument('--temperature', type=float, default=1.0, help='Temperature for sampling. Applied to both LLMs.')

    # Output location and reproducibility
    parser.add_argument('--output_dir', type=str, default="outputs", help='Directory to save the output speech files.')
    parser.add_argument('--seed', type=int, default=None, help='Seed for reproducibility (default/if none is provided: generate a random seed).')

    # argv=None makes argparse read sys.argv[1:], preserving the original behaviour
    return parser.parse_args(argv)

# Define a class for the Text-to-Speech (TTS) functionality
class TTS:
    """Two-stage text-to-speech pipeline driven by a downloaded model snapshot.

    ``__init__`` downloads the model snapshot, builds the first-stage model
    (via ``build_model``) and the second-stage ``Model``, and creates the audio
    enhancer. ``synthesise`` turns a text plus a speaker reference recording
    into a ``.wav`` file and returns its path.
    """

    # Define a constant for the end-of-audio token
    END_OF_AUDIO_TOKEN = 1024

    # Initialize the TTS object with model information and output directory
    def __init__(
        self, model_name: str = "metavoiceio/metavoice-1B-v0.1", *, seed: int = None, output_dir: str = "outputs"
    ):
        """Load every model needed for synthesis.

        Args:
            model_name: Hugging Face Hub repository id passed to
                ``snapshot_download``.
            seed: Seed stored in the second-stage ``InferenceConfig``. When
                None, a random integer in [0, 2000] is drawn.
            output_dir: Directory for generated audio; created if missing.
        """

        # Check if seed is None and generate a random seed between 0 and 2000 if true
        if seed is None:
            seed = random.randint(0, 2000)

        # Display the seed.
        print(f"Seed: {seed}")

        # NOTE(review): the seed is only forwarded to the second-stage
        # InferenceConfig below. Neither build_model() nor the main() call in
        # synthesise() receives it, so first-stage sampling is not controlled
        # by this value from within this class.

        # Set the default data type and device for PyTorch operations
        self._dtype = get_default_dtype()
        self._device = get_device()

        # Download the model directory from Hugging Face Hub
        self._model_dir = snapshot_download(repo_id=model_name)

        # Initialize the first stage adapter with the end-of-audio token
        self.first_stage_adapter = FlattenedInterleavedEncodec2Codebook(end_of_audio_token=self.END_OF_AUDIO_TOKEN)

        # Set the output directory and create it if it doesn't exist
        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

        # Configure and initialize the second stage of the model
        second_stage_ckpt_path = f"{self._model_dir}/second_stage.pt"
        config_second_stage = InferenceConfig(
            ckpt_path=second_stage_ckpt_path,
            num_samples=1,
            seed=seed,
            device=self._device,
            dtype=self._dtype,
            compile=False,
            init_from="resume",
            output_dir=self.output_dir,
        )

        # Initialize the data adapter for the second stage
        data_adapter_second_stage = TiltedEncodec(end_of_audio_token=self.END_OF_AUDIO_TOKEN)

        # Create the model for the second stage
        self.llm_second_stage = Model(
            config_second_stage, TrainedBPETokeniser, EncodecDecoder, data_adapter_fn=data_adapter_second_stage.decode
        )

        # Initialize the audio enhancer
        self.enhancer = get_enhancer("df")

        # Map the dtype name to the torch dtype; only float16 and bfloat16 are
        # handled here, any other name raises KeyError
        self.precision = {"float16": torch.float16, "bfloat16": torch.bfloat16}[self._dtype]

        # Build the model, tokenizer, and speaker model, and get the model size
        self.model, self.tokenizer, self.smodel, self.model_size = build_model(
            precision=self.precision,
            checkpoint_path=Path(f"{self._model_dir}/first_stage.pt"),
            spk_emb_ckpt_path=Path(f"{self._model_dir}/speaker_encoder.pt"),
            device=self._device,
            compile=True,
            compile_prefill=True,
        )

    @staticmethod
    def _share_output_file(path: str, user_id: int = 1000, group_id: int = 1000) -> None:
        """Make ``path`` world read/writable and try to hand it to ``user_id:group_id``.

        The permission change must succeed (errors propagate). The ownership
        change is best-effort: it usually requires superuser privileges, and a
        failure there must not abort a synthesis whose audio is already on
        disk, so the error is reported and swallowed.
        """
        os.chmod(path, 0o666)
        try:
            os.chown(path, user_id, group_id)
        except OSError as err:
            print(f"Warning: could not change ownership of {path} to {user_id}:{group_id}: {err}")

    # Define the method to synthesize speech from text
    def synthesise(self, text: str, spk_ref_path: str, top_p=0.95, guidance_scale=3.0, temperature=1.0) -> str:
        """Synthesise ``text`` in the voice of the speaker reference recording.

        Args:
            text: Text to speak; normalised with ``normalize_text`` first.
            spk_ref_path: Speaker reference audio, resolved through
                ``get_cached_file`` and validated with ``check_audio_file``.
            top_p: Top-p value handed to the first-stage ``main`` call.
            guidance_scale: Guidance scale handed to the first-stage ``main`` call.
            temperature: Temperature handed to the first-stage ``main`` call.
                The second stage always runs with ``temperature=1.0``.

        Returns:
            Path of the enhanced ``.wav`` file written by the second stage. A
            copy is also stored as ``<output_dir>/output.wav``.
        """

        # Normalize the input text
        text = normalize_text(text)

        # Get the cached file for the speaker reference path
        spk_ref_path = get_cached_file(spk_ref_path)

        # Check if the audio file is valid
        check_audio_file(spk_ref_path)

        # Get the cached speaker embedding
        spk_emb = get_cached_embedding(
            spk_ref_path,
            self.smodel,
        ).to(device=self._device, dtype=self.precision)

        # Record the start time for synthesis
        start = time.time()
        # Generate tokens using the first stage LLM
        tokens = main(
            model=self.model,
            tokenizer=self.tokenizer,
            model_size=self.model_size,
            prompt=text,
            spk_emb=spk_emb,
            top_p=torch.tensor(top_p, device=self._device, dtype=self.precision),
            guidance_scale=torch.tensor(guidance_scale, device=self._device, dtype=self.precision),
            temperature=torch.tensor(temperature, device=self._device, dtype=self.precision),
        )
        # Decode the extracted audio IDs from the tokens
        _, extracted_audio_ids = self.first_stage_adapter.decode([tokens])

        # Prepare speaker embeddings for batch processing
        b_speaker_embs = spk_emb.unsqueeze(0)

        # Generate waveform files using the second stage LLM
        wav_files = self.llm_second_stage(
            texts=[text],
            encodec_tokens=[torch.tensor(extracted_audio_ids, dtype=torch.int32, device=self._device).unsqueeze(0)],
            speaker_embs=b_speaker_embs,
            batch_size=1,
            guidance_scale=None,
            top_p=None,
            top_k=200,
            temperature=1.0,
            max_new_tokens=None,
        )

        # Only one sample is requested, so take the single result; it is used
        # below as a path without the ".wav" suffix
        wav_file = wav_files[0]

        # Enhance into a temporary file, then overwrite the raw output with it
        with tempfile.NamedTemporaryFile(suffix=".wav") as enhanced_tmp:

            self.enhancer(str(wav_file) + ".wav", enhanced_tmp.name)
            shutil.copy2(enhanced_tmp.name, str(wav_file) + ".wav")
            print(f"\nSaved audio to {wav_file}.wav")

            # Copy the enhanced audio to the output directory
            output_file = self.output_dir + "/output.wav"
            shutil.copy2(enhanced_tmp.name, output_file)

            # Make the copy readable/writable by everyone and, when permitted,
            # owned by user/group 1000 (the ownership change needs superuser
            # privileges and is therefore best-effort)
            self._share_output_file(output_file)

        # Calculate the real-time factor (RTF) of the synthesis process
        time_to_synth_s = time.time() - start
        audio, sr = librosa.load(str(wav_file) + ".wav")
        duration_s = librosa.get_duration(y=audio, sr=sr)
        print(f"\nTotal time to synth (s): {time_to_synth_s}")
        print(f"Real-time factor: {time_to_synth_s / duration_s:.2f}")

        # Return the path to the synthesized speech file
        return str(wav_file) + ".wav"

# Check if this script is the main program and execute the main logic
if __name__ == "__main__":

    # Parse command-line arguments
    args = parse_args()

    # Initialize the TTS system with the output directory and the requested seed
    tts = TTS(output_dir=args.output_dir, seed=args.seed)

    # Synthesize speech from the provided text and speaker reference
    synthesized_speech_path = tts.synthesise(
        text=args.text,
        spk_ref_path=args.spk_ref_path,
        top_p=args.top_p,
        guidance_scale=args.guidance_scale,
        temperature=args.temperature,
    )

    # Output the path to the synthesized speech
    print(f"Synthesized speech saved to: {synthesized_speech_path}")

    # Set the synthesized speech file's permissions so it's readable and writable by everyone
    os.chmod(synthesized_speech_path, 0o666)

    # Change the ownership of the synthesized speech file to user and group with ID 1000.
    # This normally requires superuser privileges, so it is best-effort: the audio has
    # already been written, and failing here should not turn a successful run into a crash.
    user_id = 1000
    group_id = 1000
    try:
        os.chown(synthesized_speech_path, user_id, group_id)
    except OSError as err:
        print(f"Warning: could not change ownership of {synthesized_speech_path} to {user_id}:{group_id}: {err}")

thanks for any help/ideas.

Also, what is the range for parameters like temperature, top-p, and guidance scale ?

vatsalaggarwal commented 8 months ago

Sounds like a bug! Will try to trace it down in a bit, but most likely seed isn't being propagated properly?

Temperature: 0 to 1; top_p: also 0 to 1; guidance scale: 1 to 3.

arthurwolf commented 8 months ago

Thanks a lot for the answer.

I'm off to bed now, but if you don't fix this by friday, at that point I'll try to follow the variable through the code and see if I find where it's broken.

Additional info: it generates the same files, to the bit:

fdc1403342e691b0beb2f8bb0b64486c  /ram/read-0-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-0-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-10-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-10-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-11-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-11-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-1-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-1-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-2-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-2-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-3-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-3-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-4-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-4-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-5-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-5-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-6-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-6-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-7-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-7-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-8-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-8-vocals.wav
fdc1403342e691b0beb2f8bb0b64486c  /ram/read-9-base.wav
2ae585c28ec2c3b9b1286c6ef0fbb69d  /ram/read-9-vocals.wav

vatsalaggarwal commented 8 months ago

I don't think I'll have the time before the weekend either

arthurwolf commented 8 months ago

@vatsalaggarwal this is the culprit: https://github.com/metavoiceio/metavoice-src/blob/main/fam/llm/fast_inference_utils.py#L334

By the way, I was a bit confused by the docs. It says to use fast_inference.py, but that file doesn't have command line parameters. I see inference.py has command line parameters, but that file isn't mentioned in the docs. What is the difference between the two, and which is supposed to be used ?

vatsalaggarwal commented 8 months ago

ugh, sorry about that! I've just searched for "seed(" throughout our codebase, and that seems like the only culprit.
sorry about this... here is the relevant section: https://github.com/metavoiceio/metavoice-src?tab=readme-ov-file#usage ... yes, it has no command line params, we're expecting people to use it in interactive mode. (reason: this is a faster version that uses a "compiled" model... compilation can take up to 2 minutes, and so there's no point in having a cli version that restarts the python process every time as you'll have to pay for compilation again).
re: the difference between inference.py and fast_inference.py, the latter is the compiled version and is used in the interactive mode as above, whereas inference.py doesn't have all the inference optimisations but, critically, doesn't use compilation, so it can be used in cli form...

we haven't had the chance to improve this experience yet

arthurwolf commented 8 months ago

thanks for the info!

we haven't had the chance to improve this experience yet

would you be interested in a PR with a few changes to code and docs?

a few more questions:

fast_inference seems to generate more "garbled" speech than the online demo I found (replicate?). is this about parameters? or does it use a different model? what's going on there?
in case that would help, I'm trying to use inference.py, but that runs out of VRAM it seems (I'm on a 12GB card with about 10GB free), what's the size of the fast and non-fast models in VRAM? I tried running with --cpu so VRAM is no longer a concern, and I get:

Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 140591.20it/s]
number of parameters: 1239.00M
number of parameters: 14.07M
getting cached speaker ref files: 100%|██████████| 1/1 [00:00<00:00, 19065.02it/s]
calculating speaker embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.18it/s]
tokens:   0%|          | 0/1728 [00:00<?, ?it/s]
batch:   0%|          | 0/1 [00:00<?, ?it/s]t/s]
[hack!!!!] Guidance is on, so we're doubling/tripling batch size!
Traceback (most recent call last):
  File "/app/fam/llm/inference.py", line 700, in <module>
    sample_utterance(
  File "/app/fam/llm/inference.py", line 543, in sample_utterance
    return _sample_utterance_batch(
  File "/app/fam/llm/inference.py", line 472, in _sample_utterance_batch
    b_tokens = first_stage_model(
  File "/app/fam/llm/inference.py", line 354, in __call__
    return self.causal_sample(
  File "/app/fam/llm/inference.py", line 229, in causal_sample
    y = self.model.generate(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/app/fam/llm/model.py", line 369, in generate
    return self._causal_sample(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/app/fam/llm/mixins/causal.py", line 410, in _causal_sample
    batch_idx = self._sample_batch(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/app/fam/llm/mixins/causal.py", line 264, in _sample_batch
    idx_next = self._sample_next_token(
  File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "/app/fam/llm/mixins/causal.py", line 85, in _sample_next_token
    list_logits, _ = self(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/fam/llm/model.py", line 282, in forward
    x = block(x)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/fam/llm/layers/combined.py", line 50, in forward
    x = x + self.attn(self.ln_1(x))
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/app/fam/llm/layers/attn.py", line 178, in forward
    y = self._torch_attn(c_x)
  File "/app/fam/llm/layers/attn.py", line 148, in _torch_attn
    y = torch.nn.functional.scaled_dot_product_attention(
RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead.

Edit: I think --dtype float32 is getting me further.

vatsalaggarwal commented 8 months ago

would you be interested in a PR with a few changes to code and docs?

yes, that would be great!

fast_inference seems to generate more "garbled" speech than the online demo I found (replicate?). is this about parameters? or does it use a different model? what's going on there?

that's weird, first time we've heard that... can you send me the link to the online demo you're talking about? it's likely that you ended up coming across a bad seed or that the online demo you're referring to is using a different reference speaker (garbling/artefacts/etc are super sensitive to the reference you use), but let me just double check once i have the link!

in case that would help, I'm trying to use inference.py, but that runs out of VRAM it seems (I'm on a 12GB card with about 10GB free), what's the size of the fast and non-fast models in VRAM? I tried running with --cpu so VRAM is no longer a concern, and I get

I don't know how well torch supports compilation on CPU, etc, so I probably wouldn't debug this on CPU. I think the fast version should use slightly more memory than non-fast (they are the exact same model, just difference in overheads). For fitting it on 12GB, I would recommend swapping out the diffusion model for the Vocos model. You could additionally try quantisation for the first stage model.

RuntimeError: Expected query, key, and value to have the same dtype, but got query.dtype: float key.dtype: c10::BFloat16 and value.dtype: c10::BFloat16 instead.

Think either the device or dtypes aren't being passed around correctly, will need to be tracked down. Note that I don't think CPUs support bfloat16 calculation, so this is unlikely to work regardless

arthurwolf commented 8 months ago

can you send me the link to the online demo you're talking about?

https://ttsdemo.themetavoice.xyz/

Two issues:

There are two sliders, and they don't seem to match the parameters in inference.py and fast_inference.py (both in terms of names and min/max values).
The page doesn't really say what model/script is used behind the scene.

I get better output from that demo than from running it locally, so I'd really like some way to reproduce what that page does.

you ended up coming across a bad seed

Consistently bad across 16 tries. I'll do some demo/reproduction later on to show you.

garbling/artefacts/etc

The sample is pretty clean, and works better in the online demo / with xtts (https://huggingface.co/spaces/coqui/xtts).

I would recommend swapping out the diffusion model for the Vocos model.

Is that trivial, or more involved? I'm a user of AI stuff, not really a designer of it. Too few PHDs (ie zero).

You could additionally try quantisation for the first stage model.

Same question.

Is there something, anything, I can do using the command line parameters, or some simple modification of the code, that would result in lower VRAM usage?

Edit: I'm playing around with the value of --guidance_scale on fast_inference.py to understand the min/max values, and at 4.0 it works, but at 8.0 I run out of VRAM. Does that mean this is something I could play with to get inference.py to fit in the VRAM?

I don't know how well torch supports compilation on CPU,

In my experience with other projects it's always been fine.

I was able to fully run inference.py on CPU with compilation enabled and with it disabled, so your code/project is better than you presumed :) Took 25 minutes though.

Note, even with --device cpu, inference.py still used 4.5GB of VRAM. That means --device is propagated only to some parts of the code?

I think the fast version should use slightly more memory than non-fast (they are the exact same model, just difference in overheads).

On my machine, the fast version runs fine on the GPU, the slow version doesn't. Is there some way to know how much each "requires"/needs ? I'd like to know what (minimum) size GPU I need to buy for this to work.

Note that I don't think CPUs support bfloat16 calculation, so this is unlikely to work regardless

I was able to get it to work, on cpu, by setting --dtype float32.

PS: Question: the docs say 45-90 seconds for the sample. Mine is 50 seconds. Should I expect better results with a 90 seconds one? Would a 5 minute sample do even better (even if by a little) than a 90 second one?

PS: Some data on trying to find the bounds of the parameters.

# nop: 0.001 zip: 0.742 log:-0.140 tmp: 1.015 sca: 0.316 see:1123.000 dur:   99.77s buf:      mpv /ram/read-3-base.wav voc:    mpv /ram/read-3-vocals.wav ups:   mpv /ram/read-3-upscale.wav sco:  0.00 « Hey, look, she's awake.» 
# nop: 0.003 zip: 0.733 log:-0.476 tmp: 0.997 sca: 0.557 see:213.000 dur:  110.53s buf:      mpv /ram/read-6-base.wav voc:    mpv /ram/read-6-vocals.wav ups:   mpv /ram/read-6-upscale.wav sco:  0.00 « Hey look, she's awake.» 
# nop: 0.011 zip: 0.733 log:-0.285 tmp: 0.955 sca: 0.637 see:788.000 dur:  109.80s buf:     mpv /ram/read-10-base.wav voc:   mpv /ram/read-10-vocals.wav ups:  mpv /ram/read-10-upscale.wav sco:  0.00 « Hey look, she's awake.» 
# nop: 0.001 zip: 0.742 log:-0.197 tmp: 0.990 sca: 0.365 see:87.000 dur:  109.05s buf:     mpv /ram/read-34-base.wav voc:   mpv /ram/read-34-vocals.wav ups:  mpv /ram/read-34-upscale.wav sco:  0.00 « Hey, look, she's awake.» 
# nop: 0.003 zip: 0.742 log:-0.350 tmp: 0.994 sca: 0.305 see:1785.000 dur:  115.96s buf:     mpv /ram/read-40-base.wav voc:   mpv /ram/read-40-vocals.wav ups:  mpv /ram/read-40-upscale.wav sco:  0.00 « Hey, look! She's awake!» 
# nop: 0.055 zip: 0.742 log:-0.470 tmp: 0.936 sca: 0.761 see:676.000 dur:  109.78s buf:     mpv /ram/read-41-base.wav voc:   mpv /ram/read-41-vocals.wav ups:  mpv /ram/read-41-upscale.wav sco:  0.00 « Hey, look, she's awake.» 
# nop: 0.020 zip: 0.742 log:-0.542 tmp: 0.922 sca: 0.227 see:891.000 dur:  125.63s buf:     mpv /ram/read-56-base.wav voc:   mpv /ram/read-56-vocals.wav ups:  mpv /ram/read-56-upscale.wav sco:  0.00 « Hey, look. She's awake.» 
# nop: 0.042 zip: 0.724 log:-0.792 tmp: 0.942 sca: 0.517 see:1375.000 dur:  115.87s buf:     mpv /ram/read-38-base.wav voc:   mpv /ram/read-38-vocals.wav ups:  mpv /ram/read-38-upscale.wav sco:  2.00 « Hey Lou, she's awake.» 
# nop: 0.009 zip: 0.667 log:-0.328 tmp: 0.974 sca: 0.654 see:1661.000 dur:  131.88s buf:      mpv /ram/read-2-base.wav voc:    mpv /ram/read-2-vocals.wav ups:   mpv /ram/read-2-upscale.wav sco:  3.00 « look she's awake» 
# nop: 0.021 zip: 0.733 log:-0.559 tmp: 0.926 sca: 0.488 see: 2.000 dur:  137.42s buf:      mpv /ram/read-4-base.wav voc:    mpv /ram/read-4-vocals.wav ups:   mpv /ram/read-4-upscale.wav sco:  3.00 « And look, she's awake.» 
# nop: 0.003 zip: 0.733 log:-0.190 tmp: 0.958 sca: 0.225 see:312.000 dur:  102.36s buf:     mpv /ram/read-15-base.wav voc:   mpv /ram/read-15-vocals.wav ups:  mpv /ram/read-15-upscale.wav sco:  3.00 « And look, she's awake.» 
# nop: 0.023 zip: 0.733 log:-0.472 tmp: 1.041 sca: 0.298 see:141.000 dur:  114.16s buf:     mpv /ram/read-17-base.wav voc:   mpv /ram/read-17-vocals.wav ups:  mpv /ram/read-17-upscale.wav sco:  3.00 « Hey Luke, she's awake.» 
# nop: 0.001 zip: 0.742 log:-0.364 tmp: 1.046 sca: 0.225 see:704.000 dur:  108.65s buf:     mpv /ram/read-21-base.wav voc:   mpv /ram/read-21-vocals.wav ups:  mpv /ram/read-21-upscale.wav sco:  3.00 « He looked, she's awake.» 
# nop: 0.003 zip: 0.758 log:-0.245 tmp: 0.952 sca: 0.448 see:874.000 dur:  124.89s buf:     mpv /ram/read-52-base.wav voc:   mpv /ram/read-52-vocals.wav ups:  mpv /ram/read-52-upscale.wav sco:  4.00 « Hidden look, she's awake.» 
# nop: 0.008 zip: 0.765 log:-0.292 tmp: 0.987 sca: 0.426 see:364.000 dur:  112.70s buf:     mpv /ram/read-13-base.wav voc:   mpv /ram/read-13-vocals.wav ups:  mpv /ram/read-13-upscale.wav sco:  5.00 « Can you look? She's awake.» 
# nop: 0.003 zip: 0.750 log:-0.481 tmp: 0.942 sca: 0.791 see:1145.000 dur:  111.07s buf:     mpv /ram/read-14-base.wav voc:   mpv /ram/read-14-vocals.wav ups:  mpv /ram/read-14-upscale.wav sco:  5.00 « still look, she's awake.» 
# nop: 0.002 zip: 0.771 log:-0.146 tmp: 0.877 sca: 0.294 see:1788.000 dur:  102.38s buf:     mpv /ram/read-16-base.wav voc:   mpv /ram/read-16-vocals.wav ups:  mpv /ram/read-16-upscale.wav sco:  6.00 « Hayden looked. She's awake.» 
# nop: 0.096 zip: 0.600 log:-0.740 tmp: 0.956 sca: 0.455 see:1934.000 dur:  110.37s buf:      mpv /ram/read-5-base.wav voc:    mpv /ram/read-5-vocals.wav ups:   mpv /ram/read-5-upscale.wav sco:  7.00 « She's awake.» 
# nop: 0.044 zip: 0.600 log:-0.734 tmp: 1.012 sca: 0.723 see:986.000 dur:  106.72s buf:     mpv /ram/read-51-base.wav voc:   mpv /ram/read-51-vocals.wav ups:  mpv /ram/read-51-upscale.wav sco:  7.00 « She's awake.» 
# nop: 0.013 zip: 0.680 log:-0.335 tmp: 0.872 sca: 0.734 see:928.000 dur:  107.74s buf:     mpv /ram/read-24-base.wav voc:   mpv /ram/read-24-vocals.wav ups:  mpv /ram/read-24-upscale.wav sco:  8.00 « Emma, she's away.» 
# nop: 0.261 zip: 0.742 log:-0.789 tmp: 0.874 sca: 0.581 see:1775.000 dur:  115.67s buf:     mpv /ram/read-42-base.wav voc:   mpv /ram/read-42-vocals.wav ups:  mpv /ram/read-42-upscale.wav sco:  8.00 « And look, she's only...» 
# nop: 0.021 zip: 0.724 log:-0.709 tmp: 0.942 sca: 0.556 see:1065.000 dur:  108.68s buf:     mpv /ram/read-46-base.wav voc:   mpv /ram/read-46-vocals.wav ups:  mpv /ram/read-46-upscale.wav sco:  9.00 « Skin low, shoes away.» 
# nop: 0.121 zip: 0.750 log:-0.644 tmp: 0.851 sca: 0.379 see:874.000 dur:  108.92s buf:     mpv /ram/read-48-base.wav voc:   mpv /ram/read-48-vocals.wav ups:  mpv /ram/read-48-upscale.wav sco:  9.00 « Picking up, she's awake.» 
# nop: 0.095 zip: 0.652 log:-0.517 tmp: 0.965 sca: 0.635 see:263.000 dur:  110.03s buf:      mpv /ram/read-7-base.wav voc:    mpv /ram/read-7-vocals.wav ups:   mpv /ram/read-7-upscale.wav sco: 10.00 « And she's weak.» 
# nop: 0.043 zip: 0.636 log:-0.648 tmp: 1.036 sca: 0.410 see:353.000 dur:  105.36s buf:     mpv /ram/read-18-base.wav voc:   mpv /ram/read-18-vocals.wav ups:  mpv /ram/read-18-upscale.wav sco: 10.00 « Here's a word.» 
# nop: 0.013 zip: 0.750 log:-0.720 tmp: 0.851 sca: 0.503 see:1590.000 dur:  115.52s buf:     mpv /ram/read-33-base.wav voc:   mpv /ram/read-33-vocals.wav ups:  mpv /ram/read-33-upscale.wav sco: 10.00 « Fear not, she's in vain.» 
# nop: 0.019 zip: 0.784 log:-0.882 tmp: 0.959 sca: 0.556 see:1545.000 dur:  115.43s buf:     mpv /ram/read-44-base.wav voc:   mpv /ram/read-44-vocals.wav ups:  mpv /ram/read-44-upscale.wav sco: 10.00 « Hey, look. Keep away from it.» 
# nop: 0.044 zip: 0.692 log:-0.572 tmp: 0.946 sca: 0.696 see:988.000 dur:  124.17s buf:     mpv /ram/read-49-base.wav voc:   mpv /ram/read-49-vocals.wav ups:  mpv /ram/read-49-upscale.wav sco: 10.00 « The work is a win.» 
# nop: 0.087 zip: 0.765 log:-0.409 tmp: 0.968 sca: 0.598 see:1311.000 dur:  140.26s buf:     mpv /ram/read-25-base.wav voc:   mpv /ram/read-25-vocals.wav ups:  mpv /ram/read-25-upscale.wav sco: 12.00 « Yeah, no, she's all right.» 
# nop: 0.018 zip: 0.769 log:-0.672 tmp: 0.936 sca: 0.322 see:633.000 dur:  104.84s buf:     mpv /ram/read-31-base.wav voc:   mpv /ram/read-31-vocals.wav ups:  mpv /ram/read-31-upscale.wav sco: 12.00 « This is not the way.» 
# nop: 0.025 zip: 0.724 log:-0.312 tmp: 1.024 sca: 0.680 see:800.000 dur:  110.05s buf:     mpv /ram/read-54-base.wav voc:   mpv /ram/read-54-vocals.wav ups:  mpv /ram/read-54-upscale.wav sco: 12.00 « and what she's doing.» 
# nop: 0.714 zip: 0.385 log:-0.737 tmp: 0.950 sca: 0.515 see:747.000 dur:  140.31s buf:     mpv /ram/read-29-base.wav voc:   mpv /ram/read-29-vocals.wav ups:  mpv /ram/read-29-upscale.wav sco: 13.00 « Okay.» 
# nop: 0.381 zip: 0.556 log:-0.852 tmp: 0.889 sca: 0.729 see:1393.000 dur:  137.36s buf:     mpv /ram/read-47-base.wav voc:   mpv /ram/read-47-vocals.wav ups:  mpv /ram/read-47-upscale.wav sco: 13.00 « Okay, now.» 
# nop: 0.037 zip: 0.636 log:-0.902 tmp: 0.975 sca: 0.388 see:532.000 dur:  193.15s buf:     mpv /ram/read-27-base.wav voc:   mpv /ram/read-27-vocals.wav ups:  mpv /ram/read-27-upscale.wav sco: 14.00 « How you doing?» 
# nop: 0.029 zip: 0.724 log:-0.475 tmp: 0.933 sca: 0.762 see:997.000 dur:  101.75s buf:     mpv /ram/read-30-base.wav voc:   mpv /ram/read-30-vocals.wav ups:  mpv /ram/read-30-upscale.wav sco: 14.00 « Do not choose a link.» 
# nop: 0.462 zip: 0.680 log:-0.796 tmp: 0.856 sca: 0.851 see:211.000 dur:  103.88s buf:     mpv /ram/read-50-base.wav voc:   mpv /ram/read-50-vocals.wav ups:  mpv /ram/read-50-upscale.wav sco: 14.00 « Okay, next video.» 
# nop: 0.518 zip: 0.556 log:-0.339 tmp: 0.923 sca: 0.676 see:1386.000 dur:  138.42s buf:      mpv /ram/read-0-base.wav voc:    mpv /ram/read-0-vocals.wav ups:   mpv /ram/read-0-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.123 zip: 0.385 log:-0.744 tmp: 1.017 sca: 0.537 see:126.000 dur:  132.90s buf:      mpv /ram/read-1-base.wav voc:    mpv /ram/read-1-vocals.wav ups:   mpv /ram/read-1-upscale.wav sco: 15.00 « Amen.» 
# nop: 0.192 zip: 0.385 log:-0.350 tmp: 0.856 sca: 0.695 see:351.000 dur:  138.78s buf:      mpv /ram/read-8-base.wav voc:    mpv /ram/read-8-vocals.wav ups:   mpv /ram/read-8-upscale.wav sco: 15.00 « Amen.» 
# nop: 0.953 zip: 0.556 log:-0.567 tmp: 0.877 sca: 0.824 see:1567.000 dur:  134.00s buf:     mpv /ram/read-12-base.wav voc:   mpv /ram/read-12-vocals.wav ups:  mpv /ram/read-12-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.445 zip: 0.556 log:-0.812 tmp: 0.974 sca: 0.568 see:1900.000 dur:  138.16s buf:     mpv /ram/read-20-base.wav voc:   mpv /ram/read-20-vocals.wav ups:  mpv /ram/read-20-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.028 zip: 0.806 log:-0.681 tmp: 0.946 sca: 0.882 see:471.000 dur:  112.61s buf:     mpv /ram/read-22-base.wav voc:   mpv /ram/read-22-vocals.wav ups:  mpv /ram/read-22-upscale.wav sco: 15.00 « And you know, this is why» 
# nop: 0.028 zip: 0.556 log:-0.115 tmp: 1.041 sca: 0.739 see:1247.000 dur:  141.43s buf:     mpv /ram/read-32-base.wav voc:   mpv /ram/read-32-vocals.wav ups:  mpv /ram/read-32-upscale.wav sco: 15.00 « Beautiful.» 
# nop: 0.791 zip: 0.556 log:-0.779 tmp: 0.918 sca: 0.934 see:174.000 dur:  139.46s buf:     mpv /ram/read-35-base.wav voc:   mpv /ram/read-35-vocals.wav ups:  mpv /ram/read-35-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.801 zip: 0.556 log:-0.711 tmp: 0.887 sca: 0.871 see:41.000 dur:  139.71s buf:     mpv /ram/read-36-base.wav voc:   mpv /ram/read-36-vocals.wav ups:  mpv /ram/read-36-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.285 zip: 0.556 log:-0.619 tmp: 1.020 sca: 0.793 see:1761.000 dur:  138.66s buf:     mpv /ram/read-37-base.wav voc:   mpv /ram/read-37-vocals.wav ups:  mpv /ram/read-37-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.727 zip: 0.556 log:-0.795 tmp: 0.877 sca: 0.254 see:988.000 dur:  137.33s buf:     mpv /ram/read-39-base.wav voc:   mpv /ram/read-39-vocals.wav ups:  mpv /ram/read-39-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.518 zip: 0.556 log:-0.339 tmp: 1.010 sca: 0.973 see:161.000 dur:  142.83s buf:     mpv /ram/read-45-base.wav voc:   mpv /ram/read-45-vocals.wav ups:  mpv /ram/read-45-upscale.wav sco: 15.00 « Thank you.» 
# nop: 0.761 zip: 0.556 log:-0.790 tmp: 0.859 sca: 0.806 see:1823.000 dur:  139.50s buf:     mpv /ram/read-53-base.wav voc:   mpv /ram/read-53-vocals.wav ups:  mpv /ram/read-53-upscale.wav sco: 15.00 « Thank you.» 
# nop:     * zip:     * log:     * dur:  116.23s buf:      mpv /ram/read-9-base.wav voc:    mpv /ram/read-9-vocals.wav ups:   mpv /ram/read-9-upscale.wav sco: 16.00 «» 
# nop:     * zip:     * log:     * dur:  105.78s buf:     mpv /ram/read-11-base.wav voc:   mpv /ram/read-11-vocals.wav ups:  mpv /ram/read-11-upscale.wav sco: 16.00 «» 
# nop: 0.156 zip: 0.765 log:-0.760 tmp: 0.878 sca: 0.608 see:1627.000 dur:  105.85s buf:     mpv /ram/read-26-base.wav voc:   mpv /ram/read-26-vocals.wav ups:  mpv /ram/read-26-upscale.wav sco: 16.00 « It's only love today, man.» 
# nop:     * zip:     * log:     * dur:  127.11s buf:     mpv /ram/read-43-base.wav voc:   mpv /ram/read-43-vocals.wav ups:  mpv /ram/read-43-upscale.wav sco: 16.00 «» 
# nop: 0.046 zip: 0.778 log:-0.612 tmp: 0.885 sca: 0.302 see:259.000 dur:  124.92s buf:     mpv /ram/read-23-base.wav voc:   mpv /ram/read-23-vocals.wav ups:  mpv /ram/read-23-upscale.wav sco: 17.00 « The game is not in your way.» 
# nop: 0.014 zip: 0.784 log:-0.408 tmp: 1.046 sca: 0.613 see:624.000 dur:  107.71s buf:     mpv /ram/read-55-base.wav voc:   mpv /ram/read-55-vocals.wav ups:  mpv /ram/read-55-upscale.wav sco: 17.00 « Ain't about to see their way.» 
# nop: 0.167 zip: 0.810 log:-0.912 tmp: 1.014 sca: 0.677 see:1170.000 dur:  126.88s buf:     mpv /ram/read-28-base.wav voc:   mpv /ram/read-28-vocals.wav ups:  mpv /ram/read-28-upscale.wav sco: 21.00 « I mean, who would love to do that?» 
# nop: 0.025 zip: 0.902 log:-0.680 tmp: 1.018 sca: 0.760 see: 1.000 dur:  108.40s buf:     mpv /ram/read-19-base.wav voc:   mpv /ram/read-19-vocals.wav ups:  mpv /ram/read-19-upscale.wav sco: 24.00 « during the month she was doing weight»

Using Whisper to get text from the generated sound, and scoring based on the Levenshtein distance between the intended and the understood text. This is the 4th round of 4, each getting closer. I'm finding the best results with temperature around 0.95 (default) and guidance scale as low as possible, but the lower it is, the further the voice is from what I need; the higher it is, the less understandable the voice is. Tomorrow I'll make a new voice sample, hoping it helps.

vatsalaggarwal commented 8 months ago

There are two sliders, and they don't seem to match the parameters in inference.py and fast_inference.py (both in terms of names and min/max values).

Yep, we've scaled the top_p and guidance_scale parameters to make them more convenient. See https://github.com/metavoiceio/metavoice-src/blob/main/app.py#L29-L36 for reference.

Consistently bad across 16 tries

Just to double check, did a new seed get used on each try? It's possible each try ended up using the same seed...

I'll do some demo/reproduction later on to show you.

this would be super helpful!

re the rest:

would say that swapping out vocos for mbd might take some work
different guidance_scale values should not impact VRAM usage
yeah, it looks like device isn't being propagated properly... i think one of the draft PRs currently open fixes this but it is lagging behind main unfortunately
We recommend a GPU with 24GB VRAM minimum, and don't internally test on GPUs below that spec.
float16 should also work fine on CPU, and might speed you up?
hard to know what's going on with the samples... if you can share it with me at vatsal@themetavoice.xyz that's ideal! longer samples can help, but quality/compression of the sample is equally important.

metavoiceio / metavoice-src

Changing the seed does nothing. #86