gladia-research-group / multi-source-diffusion-models

145 stars 11 forks source link

Separating a mixture file? #7

Open bubblegg opened 10 months ago

bubblegg commented 10 months ago

Is it possible to separate a mixture file that I provide myself, and if so, how? Thank you in advance!

PreFKim commented 2 months ago

I separated my custom mixture file, but the result did not meet my expectations.

  1. Setting Params
    
    import sys
    from pathlib import Path
    from typing import *
    import torch

DEVICE = torch.device("cuda:0") SAMPLE_RATE = 22050 # < IMPORTANT: do not change STEMS = ["bass","drums","guitar","piano"] # < IMPORTANT: do not change ROOT_PATH = Path("..").resolve().absolute() CKPT_PATH = ROOT_PATH / "ckpts" DATA_PATH = ROOT_PATH / "data"

sys.path.append(str(ROOT_PATH)) %load_ext autoreload %autoreload 2

2. Load Model

```python
from main.module_base import Model

# Restore the model from its checkpoint file, then move it to DEVICE
checkpoint_file = CKPT_PATH / f"glorious-star-335/epoch=729-valid_loss=0.014.ckpt"
model = Model.load_from_checkpoint(checkpoint_file)
model = model.to(DEVICE)

# Denoising function of the wrapped diffusion model; handed to the separation call later
denoise_fn = model.model.diffusion.denoise_fn
  3. Load mixture file
import soundfile as sf
import torch

# always_2d=True yields a (seq_len, channels) array even for mono files,
# which would otherwise come back 1-D and break the transpose below.
audio, sr = sf.read('/.../music.wav', always_2d=True)

if audio.shape[1] == 1:
    # Mono input: duplicate the channel so the (batch, stereo, seq_len)
    # layout expected by the following cells is preserved.
    audio = audio.repeat(2, axis=1)
elif audio.shape[1] != 2:
    raise ValueError(f"expected mono or stereo audio, got {audio.shape[1]} channels")

if sr != SAMPLE_RATE:
    # Feeding audio recorded at another rate as if it were SAMPLE_RATE makes
    # the separated stems sound pitched and distorted, so resample first.
    from scipy.signal import resample_poly

    audio = resample_poly(audio, SAMPLE_RATE, sr, axis=0)
    sr = SAMPLE_RATE

# seq_len, 2 -> 2, seq_len -> 1, 2, seq_len (batch, stereo, seq_len)
audio = torch.from_numpy(audio.T.copy()).unsqueeze(0)
print(audio.shape, sr) # sr now always equals SAMPLE_RATE
  4. Separate
from main.separation import separate_mixture
from audio_diffusion_pytorch import KarrasSchedule

# Generation hyper-parameters
s_churn = 20.0
num_steps = 150
num_resamples = 2
chunk_len = 262144  # number of samples separated in one call

# Define timestep schedule
schedule = KarrasSchedule(sigma_min=1e-4, sigma_max=20.0, rho=7)(num_steps, DEVICE)

start_idx = 0
# NOTE: despite its name, `sources` holds the mixture chunk passed to the model.
sources = audio[:, :, start_idx:start_idx + chunk_len].to(DEVICE)
sources = sources.mean(dim=1, keepdim=True).float()  # Downmix channels to mono

valid_len = sources.shape[-1]
if valid_len == 0:
    raise ValueError("start_idx is beyond the end of the audio")
if valid_len < chunk_len:
    # Clip is shorter than one chunk: zero-pad so the mixture and the noise
    # tensor have the same length, then trim the padding off the result.
    sources = torch.nn.functional.pad(sources, (0, chunk_len - valid_len))

separated = separate_mixture(
    mixture=sources,
    denoise_fn=denoise_fn,
    sigmas=schedule,
    noises=torch.randn(1, len(STEMS), chunk_len).to(DEVICE),  # one noise track per stem
    s_churn=s_churn,  # > 0 to add randomness
    num_resamples=num_resamples,
)
separated = separated[..., :valid_len]  # no-op when the chunk was full length
separated.shape
  5. Audio to file
import numpy as np
import soundfile as sf

# Drop the leading batch axis so that each row is one stem's waveform
separated = separated.detach().cpu().numpy().squeeze(0)

# Write one WAV file per stem into the current directory, using the shared
# SAMPLE_RATE constant rather than a repeated literal.
for stem, stem_audio in zip(STEMS, separated):
    sf.write(
        f"./{stem}.wav",
        stem_audio,
        SAMPLE_RATE,
        format="WAV",
    )
RP335 commented 12 hours ago

I separated my custom mixture file, but the result did not meet my expectations.

  1. Setting Params
import sys
from pathlib import Path
from typing import *
import torch

DEVICE = torch.device("cuda:0")
SAMPLE_RATE = 22050 # < IMPORTANT: do not change
STEMS = ["bass","drums","guitar","piano"] # < IMPORTANT: do not change
ROOT_PATH = Path("..").resolve().absolute()
CKPT_PATH = ROOT_PATH / "ckpts"
DATA_PATH = ROOT_PATH / "data"

sys.path.append(str(ROOT_PATH))
%load_ext autoreload
%autoreload 2
  2. Load Model
from main.module_base import Model

# Restore the model from its checkpoint file, then move it to DEVICE
checkpoint_file = CKPT_PATH / f"glorious-star-335/epoch=729-valid_loss=0.014.ckpt"
model = Model.load_from_checkpoint(checkpoint_file)
model = model.to(DEVICE)

# Denoising function of the wrapped diffusion model; handed to the separation call later
denoise_fn = model.model.diffusion.denoise_fn
  3. Load mixture file
import soundfile as sf
import torch

# always_2d=True yields a (seq_len, channels) array even for mono files,
# which would otherwise come back 1-D and break the transpose below.
audio, sr = sf.read('/.../music.wav', always_2d=True)

if audio.shape[1] == 1:
    # Mono input: duplicate the channel so the (batch, stereo, seq_len)
    # layout expected by the following cells is preserved.
    audio = audio.repeat(2, axis=1)
elif audio.shape[1] != 2:
    raise ValueError(f"expected mono or stereo audio, got {audio.shape[1]} channels")

if sr != SAMPLE_RATE:
    # Feeding audio recorded at another rate as if it were SAMPLE_RATE makes
    # the separated stems sound pitched and distorted, so resample first.
    from scipy.signal import resample_poly

    audio = resample_poly(audio, SAMPLE_RATE, sr, axis=0)
    sr = SAMPLE_RATE

# seq_len, 2 -> 2, seq_len -> 1, 2, seq_len (batch, stereo, seq_len)
audio = torch.from_numpy(audio.T.copy()).unsqueeze(0)
print(audio.shape, sr) # sr now always equals SAMPLE_RATE
  4. Separate
from main.separation import separate_mixture
from audio_diffusion_pytorch import KarrasSchedule

# Generation hyper-parameters
s_churn = 20.0
num_steps = 150
num_resamples = 2
chunk_len = 262144  # number of samples separated in one call

# Define timestep schedule
schedule = KarrasSchedule(sigma_min=1e-4, sigma_max=20.0, rho=7)(num_steps, DEVICE)

start_idx = 0
# NOTE: despite its name, `sources` holds the mixture chunk passed to the model.
sources = audio[:, :, start_idx:start_idx + chunk_len].to(DEVICE)
sources = sources.mean(dim=1, keepdim=True).float()  # Downmix channels to mono

valid_len = sources.shape[-1]
if valid_len == 0:
    raise ValueError("start_idx is beyond the end of the audio")
if valid_len < chunk_len:
    # Clip is shorter than one chunk: zero-pad so the mixture and the noise
    # tensor have the same length, then trim the padding off the result.
    sources = torch.nn.functional.pad(sources, (0, chunk_len - valid_len))

separated = separate_mixture(
    mixture=sources,
    denoise_fn=denoise_fn,
    sigmas=schedule,
    noises=torch.randn(1, len(STEMS), chunk_len).to(DEVICE),  # one noise track per stem
    s_churn=s_churn,  # > 0 to add randomness
    num_resamples=num_resamples,
)
separated = separated[..., :valid_len]  # no-op when the chunk was full length
separated.shape
  5. Audio to file
import numpy as np
import soundfile as sf

# Drop the leading batch axis so that each row is one stem's waveform
separated = separated.detach().cpu().numpy().squeeze(0)

# Write one WAV file per stem into the current directory, using the shared
# SAMPLE_RATE constant rather than a repeated literal.
for stem, stem_audio in zip(STEMS, separated):
    sf.write(
        f"./{stem}.wav",
        stem_audio,
        SAMPLE_RATE,
        format="WAV",
    )

The audio sounds pitched and distorted. I've done an implementation where I added my custom mixture file in evaluation/experiments.py and ran the script with `PYTHONPATH=. python evaluate.py exp=eval_msdm_dirac`.

The code is here: https://github.com/RP335/msdm_custom. You might have to tweak things to make it compatible with the models. I've used my own model, which I trained on speech/non-speech data.